#' Original outlierKD function by By Klodian Dhana,
#' https://www.r-bloggers.com/identify-describe-plot-and-remove-the-outliers-from-the-dataset/
#' Modified to have third argument for removing outliers instead of interactive prompt,
#' and after removing outlier, original df will not be changed. The function returns the a df,
#' which can be saved as original df name if desired.
#' Also added QQ-plot in the output, with options to show/hide boxplot, histogram, qqplot.
#' Check outliers, and option to remove them, save as a new dataframe.
#' @param df The dataframe.
#' @param var The variable in the dataframe to be checked for outliers
#' @param rm Boolean. Whether to remove outliers or not.
#' @param boxplt Boolean. Whether to show the boxplot, before and after outliers removed.
#' @param histogram Boolean. Whether to show the histogram, before and after outliers removed.
#' @param qqplt Boolean. Whether to show the qqplot, before and after outliers removed.
#' @return The dataframe with outliers replaced by NA if rm==TRUE, or df if nothing changed
#' @examples
#' outlierKD2(mydf, height, FALSE, TRUE, TRUE, TRUE)
#' mydf = outlierKD2(mydf, height, TRUE, TRUE, TRUE, TRUE)
#' mydfnew = outlierKD2(mydf, height, TRUE)
#' @export
outlierKD2 <- function(df, var, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE) {
dt = df # duplicate the dataframe for potential alteration
var_name <- eval(substitute(var),eval(dt))
na1 <- sum(is.na(var_name))
m1 <- mean(var_name, na.rm = T)
colTotal <- boxplt+histogram+qqplt
par(mfrow=c(2, max(2,colTotal)), oma=c(0,0,3,0)) # fixed issue with only 0 or 1 chart selected
if (qqplt) {
qqnorm(var_name, main = "With outliers")
qqline(var_name)
}
if (histogram) { hist(var_name, main="With outliers", xlab=NA, ylab=NA) }
if (boxplt) { boxplot(var_name, main="With outliers") }
outlier <- boxplot.stats(var_name)$out
mo <- mean(outlier)
var_name <- ifelse(var_name %in% outlier, NA, var_name)
if (qqplt) {
qqnorm(var_name, main = "Without outliers")
qqline(var_name)
}
if (histogram) { hist(var_name, main="Without outliers", xlab=NA, ylab=NA) }
if (boxplt) { boxplot(var_name, main="Without outliers") }
if(colTotal > 0) { # if no charts are wanted, skip this section
title("Outlier Check", outer=TRUE)
na2 <- sum(is.na(var_name))
cat("Outliers identified:", na2 - na1, "\n")
cat("Propotion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name))*100, 1), "\n")
cat("Mean of the outliers:", round(mo, 2), "\n")
m2 <- mean(var_name, na.rm = T)
cat("Mean without removing outliers:", round(m1, 2), "\n")
cat("Mean if we remove outliers:", round(m2, 2), "\n")
}
# response <- readline(prompt="Do you want to remove outliers and to replace with NA? [yes/no]: ")
# if(response == "y" | response == "yes"){
if(rm){
dt[as.character(substitute(var))] <- invisible(var_name)
#assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
cat("Outliers successfully removed", "\n")
return(invisible(dt))
} else {
cat("Nothing changed", "\n")
return(invisible(df))
}
}
There is considerable evidence indicating lending disparities throughout the United States. (Steil et. al: 2018). For our research topic, we will explore lending practices in one of the fastest appreciating real estate markets over the past thirty years - the state of California. Specifically, we aim to look at the factors that are associated with denials for non-commercial mortgage loans.
Our SMART question is:
“Which factors drove denials for mortgages in California in 2019?”
To answer this question, we are using the Federal Financial Institutions Examination Council’s (FFIEC) Home Mortgage Disclosure Act (HMDA) dataset from 2019, located here: https://ffiec.cfpb.gov/data-publication/dynamic-national-loan-level-dataset/2019. We are focusing on a subset of data of 10,000 observations from 2019 that we will further filter on California leaving us with 5,196 observations.
Our Github repository address is:https://github.com/brandonchin19/Team3/.
hmda_ca <- data.frame(read.csv("hmda_ca_new.csv"))
str(hmda_ca)
## 'data.frame': 50000 obs. of 99 variables:
## $ activity_year : int 2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
## $ lei : chr "549300HN58ONH5KNJJ12" "549300ZPP46PTHQ2RF22" "549300ZPP46PTHQ2RF22" "549300ZPP46PTHQ2RF22" ...
## $ derived_msa.md : int 31084 42220 42220 42220 42220 42220 42220 42220 42220 39820 ...
## $ state_code : chr "CA" "CA" "CA" "CA" ...
## $ county_code : int 6037 6097 6097 6097 6097 6097 6097 6097 6097 6089 ...
## $ census_tract : num 6.04e+09 6.10e+09 6.10e+09 6.10e+09 6.10e+09 ...
## $ conforming_loan_limit : chr "NC" "NC" "C" "C" ...
## $ derived_loan_product_type : chr "Conventional:First Lien" "Conventional:First Lien" "Conventional:First Lien" "Conventional:First Lien" ...
## $ derived_dwelling_category : chr "Single Family (1-4 Units):Site-Built" "Single Family (1-4 Units):Site-Built" "Single Family (1-4 Units):Site-Built" "Single Family (1-4 Units):Site-Built" ...
## $ derived_ethnicity : chr "Not Hispanic or Latino" "Not Hispanic or Latino" "Not Hispanic or Latino" "Not Hispanic or Latino" ...
## $ derived_race : chr "White" "White" "White" "Race Not Available" ...
## $ derived_sex : chr "Male" "Male" "Male" "Female" ...
## $ action_taken : int 1 1 1 1 1 1 1 1 1 1 ...
## $ purchaser_type : int 9 71 71 71 71 71 71 71 71 71 ...
## $ preapproval : int 1 2 2 2 2 2 2 2 2 2 ...
## $ loan_type : int 1 1 1 1 1 1 1 1 1 1 ...
## $ loan_purpose : int 1 32 1 1 32 32 1 31 32 31 ...
## $ lien_status : int 1 1 1 1 1 1 1 1 1 1 ...
## $ reverse_mortgage : int 2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
## $ open.end_line_of_credit : int 2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
## $ business_or_commercial_purpose : int 2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
## $ loan_amount : int 1125000 875000 375000 605000 1335000 405000 105000 265000 355000 185000 ...
## $ loan_to_value_ratio : chr "80" "Exempt" "Exempt" "Exempt" ...
## $ interest_rate : chr "5.25" "Exempt" "Exempt" "Exempt" ...
## $ rate_spread : chr NA "Exempt" "Exempt" "Exempt" ...
## $ hoepa_status : int 2 2 2 2 2 2 2 2 2 2 ...
## $ total_loan_costs : chr "16765" "Exempt" "Exempt" "Exempt" ...
## $ total_points_and_fees : chr NA "Exempt" "Exempt" "Exempt" ...
## $ origination_charges : chr "11200" "Exempt" "Exempt" "Exempt" ...
## $ discount_points : chr NA "Exempt" "Exempt" "Exempt" ...
## $ lender_credits : chr NA "Exempt" "Exempt" "Exempt" ...
## $ loan_term : chr "360" "Exempt" "Exempt" "Exempt" ...
## $ prepayment_penalty_term : chr NA "Exempt" "Exempt" "Exempt" ...
## $ intro_rate_period : chr NA "Exempt" "Exempt" "Exempt" ...
## $ negative_amortization : int 2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
## $ interest_only_payment : int 2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
## $ balloon_payment : int 2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
## $ other_nonamortizing_features : int 2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
## $ property_value : chr "1405000" "Exempt" "Exempt" "Exempt" ...
## $ construction_method : int 1 1 1 1 1 1 2 1 1 1 ...
## $ occupancy_type : int 1 1 1 1 1 1 1 1 1 2 ...
## $ manufactured_home_secured_property_type : int 3 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
## $ manufactured_home_land_property_interest: int 5 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
## $ total_units : chr "1" "1" "1" "1" ...
## $ multifamily_affordable_units : chr NA "Exempt" "Exempt" "Exempt" ...
## $ income : int 432 254 109 134 363 140 56 60 61 355 ...
## $ debt_to_income_ratio : chr "20%-<30%" "Exempt" "Exempt" "Exempt" ...
## $ applicant_credit_score_type : int 2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
## $ co.applicant_credit_score_type : int 10 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
## $ applicant_ethnicity.1 : int 2 2 2 2 2 1 1 1 2 2 ...
## $ applicant_ethnicity.2 : int NA NA NA NA NA 2 NA NA NA NA ...
## $ applicant_ethnicity.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ applicant_ethnicity.4 : logi NA NA NA NA NA NA ...
## $ applicant_ethnicity.5 : logi NA NA NA NA NA NA ...
## $ co.applicant_ethnicity.1 : int 5 5 5 5 2 1 1 5 5 2 ...
## $ co.applicant_ethnicity.2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ co.applicant_ethnicity.3 : logi NA NA NA NA NA NA ...
## $ co.applicant_ethnicity.4 : logi NA NA NA NA NA NA ...
## $ co.applicant_ethnicity.5 : logi NA NA NA NA NA NA ...
## $ applicant_ethnicity_observed : int 2 1 3 3 3 3 2 3 3 3 ...
## $ co.applicant_ethnicity_observed : int 4 4 4 4 3 3 2 4 4 3 ...
## $ applicant_race.1 : int 5 5 5 6 5 6 5 5 5 5 ...
## $ applicant_race.2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ applicant_race.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ applicant_race.4 : int NA NA NA NA NA NA NA NA NA NA ...
## $ applicant_race.5 : logi NA NA NA NA NA NA ...
## $ co.applicant_race.1 : int 8 8 8 8 5 6 5 8 8 5 ...
## $ co.applicant_race.2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ co.applicant_race.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ co.applicant_race.4 : logi NA NA NA NA NA NA ...
## $ co.applicant_race.5 : logi NA NA NA NA NA NA ...
## $ applicant_race_observed : int 2 1 3 3 3 3 2 3 3 3 ...
## $ co.applicant_race_observed : int 4 4 4 4 3 3 2 4 4 3 ...
## $ applicant_sex : int 1 1 1 2 1 1 2 2 2 1 ...
## $ co.applicant_sex : int 5 5 5 5 1 2 1 5 5 2 ...
## $ applicant_sex_observed : int 2 1 3 3 3 3 2 3 3 3 ...
## $ co.applicant_sex_observed : int 4 4 4 4 3 3 2 4 4 3 ...
## $ applicant_age : chr "35-44" "65-74" "35-44" "25-34" ...
## $ co.applicant_age : chr "9999" "9999" "9999" "9999" ...
## $ applicant_age_above_62 : chr "No" "Yes" "No" "No" ...
## $ co.applicant_age_above_62 : chr NA NA NA NA ...
## $ submission_of_application : int 1 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
## $ initially_payable_to_institution : int 2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
## $ aus.1 : int 6 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
## $ aus.2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ aus.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ aus.4 : int NA NA NA NA NA NA NA NA NA NA ...
## $ aus.5 : int NA NA NA NA NA NA NA NA NA NA ...
## $ denial_reason.1 : int 10 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
## $ denial_reason.2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ denial_reason.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ denial_reason.4 : logi NA NA NA NA NA NA ...
## $ tract_population : int 4162 4094 12736 5746 4286 12736 3272 4309 3932 1581 ...
## $ tract_minority_population_percent : num 16.2 20.1 58.6 33.8 14.6 ...
## $ ffiec_msa_md_median_family_income : int 73100 93300 93300 93300 93300 93300 93300 93300 93300 61900 ...
## $ tract_to_msa_income_percentage : int 249 124 79 118 94 79 101 122 76 90 ...
## $ tract_owner_occupied_units : int 1369 1211 1960 1430 1435 1960 790 1249 1048 516 ...
## $ tract_one_to_four_family_homes : int 1635 1801 3422 2173 2459 3422 1235 1562 1884 1094 ...
## $ tract_median_age_of_housing_units : int 56 40 26 29 57 26 33 29 54 41 ...
dim(hmda_ca)
## [1] 50000 99
xkabledplyhead(hmda_ca,5)
| activity_year | lei | derived_msa.md | state_code | county_code | census_tract | conforming_loan_limit | derived_loan_product_type | derived_dwelling_category | derived_ethnicity | derived_race | derived_sex | action_taken | purchaser_type | preapproval | loan_type | loan_purpose | lien_status | reverse_mortgage | open.end_line_of_credit | business_or_commercial_purpose | loan_amount | loan_to_value_ratio | interest_rate | rate_spread | hoepa_status | total_loan_costs | total_points_and_fees | origination_charges | discount_points | lender_credits | loan_term | prepayment_penalty_term | intro_rate_period | negative_amortization | interest_only_payment | balloon_payment | other_nonamortizing_features | property_value | construction_method | occupancy_type | manufactured_home_secured_property_type | manufactured_home_land_property_interest | total_units | multifamily_affordable_units | income | debt_to_income_ratio | applicant_credit_score_type | co.applicant_credit_score_type | applicant_ethnicity.1 | applicant_ethnicity.2 | applicant_ethnicity.3 | applicant_ethnicity.4 | applicant_ethnicity.5 | co.applicant_ethnicity.1 | co.applicant_ethnicity.2 | co.applicant_ethnicity.3 | co.applicant_ethnicity.4 | co.applicant_ethnicity.5 | applicant_ethnicity_observed | co.applicant_ethnicity_observed | applicant_race.1 | applicant_race.2 | applicant_race.3 | applicant_race.4 | applicant_race.5 | co.applicant_race.1 | co.applicant_race.2 | co.applicant_race.3 | co.applicant_race.4 | co.applicant_race.5 | applicant_race_observed | co.applicant_race_observed | applicant_sex | co.applicant_sex | applicant_sex_observed | co.applicant_sex_observed | applicant_age | co.applicant_age | applicant_age_above_62 | co.applicant_age_above_62 | submission_of_application | initially_payable_to_institution | aus.1 | aus.2 | aus.3 | aus.4 | aus.5 | denial_reason.1 | denial_reason.2 | denial_reason.3 | denial_reason.4 | tract_population | tract_minority_population_percent | ffiec_msa_md_median_family_income | tract_to_msa_income_percentage | tract_owner_occupied_units | tract_one_to_four_family_homes | tract_median_age_of_housing_units |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2019 | 549300HN58ONH5KNJJ12 | 31084 | CA | 6037 | 6.04e+09 | NC | Conventional:First Lien | Single Family (1-4 Units):Site-Built | Not Hispanic or Latino | White | Male | 1 | 9 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 1125000 | 80 | 5.25 | NA | 2 | 16765 | NA | 11200 | NA | NA | 360 | NA | NA | 2 | 2 | 2 | 2 | 1405000 | 1 | 1 | 3 | 5 | 1 | NA | 432 | 20%-<30% | 2 | 10 | 2 | NA | NA | NA | NA | 5 | NA | NA | NA | NA | 2 | 4 | 5 | NA | NA | NA | NA | 8 | NA | NA | NA | NA | 2 | 4 | 1 | 5 | 2 | 4 | 35-44 | 9999 | No | NA | 1 | 2 | 6 | NA | NA | NA | NA | 10 | NA | NA | NA | 4162 | 16.2 | 73100 | 249 | 1369 | 1635 | 56 |
| 2019 | 549300ZPP46PTHQ2RF22 | 42220 | CA | 6097 | 6.10e+09 | NC | Conventional:First Lien | Single Family (1-4 Units):Site-Built | Not Hispanic or Latino | White | Male | 1 | 71 | 2 | 1 | 32 | 1 | 1111 | 1111 | 1111 | 875000 | Exempt | Exempt | Exempt | 2 | Exempt | Exempt | Exempt | Exempt | Exempt | Exempt | Exempt | Exempt | 1111 | 1111 | 1111 | 1111 | Exempt | 1 | 1 | 1111 | 1111 | 1 | Exempt | 254 | Exempt | 1111 | 1111 | 2 | NA | NA | NA | NA | 5 | NA | NA | NA | NA | 1 | 4 | 5 | NA | NA | NA | NA | 8 | NA | NA | NA | NA | 1 | 4 | 1 | 5 | 1 | 4 | 65-74 | 9999 | Yes | NA | 1111 | 1111 | 1111 | NA | NA | NA | NA | 1111 | NA | NA | NA | 4094 | 20.1 | 93300 | 124 | 1211 | 1801 | 40 |
| 2019 | 549300ZPP46PTHQ2RF22 | 42220 | CA | 6097 | 6.10e+09 | C | Conventional:First Lien | Single Family (1-4 Units):Site-Built | Not Hispanic or Latino | White | Male | 1 | 71 | 2 | 1 | 1 | 1 | 1111 | 1111 | 1111 | 375000 | Exempt | Exempt | Exempt | 2 | Exempt | Exempt | Exempt | Exempt | Exempt | Exempt | Exempt | Exempt | 1111 | 1111 | 1111 | 1111 | Exempt | 1 | 1 | 1111 | 1111 | 1 | Exempt | 109 | Exempt | 1111 | 1111 | 2 | NA | NA | NA | NA | 5 | NA | NA | NA | NA | 3 | 4 | 5 | NA | NA | NA | NA | 8 | NA | NA | NA | NA | 3 | 4 | 1 | 5 | 3 | 4 | 35-44 | 9999 | No | NA | 1111 | 1111 | 1111 | NA | NA | NA | NA | 1111 | NA | NA | NA | 12736 | 58.6 | 93300 | 79 | 1960 | 3422 | 26 |
| 2019 | 549300ZPP46PTHQ2RF22 | 42220 | CA | 6097 | 6.10e+09 | C | Conventional:First Lien | Single Family (1-4 Units):Site-Built | Not Hispanic or Latino | Race Not Available | Female | 1 | 71 | 2 | 1 | 1 | 1 | 1111 | 1111 | 1111 | 605000 | Exempt | Exempt | Exempt | 2 | Exempt | Exempt | Exempt | Exempt | Exempt | Exempt | Exempt | Exempt | 1111 | 1111 | 1111 | 1111 | Exempt | 1 | 1 | 1111 | 1111 | 1 | Exempt | 134 | Exempt | 1111 | 1111 | 2 | NA | NA | NA | NA | 5 | NA | NA | NA | NA | 3 | 4 | 6 | NA | NA | NA | NA | 8 | NA | NA | NA | NA | 3 | 4 | 2 | 5 | 3 | 4 | 25-34 | 9999 | No | NA | 1111 | 1111 | 1111 | NA | NA | NA | NA | 1111 | NA | NA | NA | 5746 | 33.8 | 93300 | 118 | 1430 | 2173 | 29 |
| 2019 | 549300ZPP46PTHQ2RF22 | 42220 | CA | 6097 | 6.10e+09 | NC | Conventional:First Lien | Single Family (1-4 Units):Site-Built | Not Hispanic or Latino | White | Male | 1 | 71 | 2 | 1 | 32 | 1 | 1111 | 1111 | 1111 | 1335000 | Exempt | Exempt | Exempt | 2 | Exempt | Exempt | Exempt | Exempt | Exempt | Exempt | Exempt | Exempt | 1111 | 1111 | 1111 | 1111 | Exempt | 1 | 1 | 1111 | 1111 | 1 | Exempt | 363 | Exempt | 1111 | 1111 | 2 | NA | NA | NA | NA | 2 | NA | NA | NA | NA | 3 | 3 | 5 | NA | NA | NA | NA | 5 | NA | NA | NA | NA | 3 | 3 | 1 | 1 | 3 | 3 | 55-64 | 55-64 | No | No | 1111 | 1111 | 1111 | NA | NA | NA | NA | 1111 | NA | NA | NA | 4286 | 14.6 | 93300 | 94 | 1435 | 2459 | 57 |
xkabledplytail(hmda_ca,5)
| activity_year | lei | derived_msa.md | state_code | county_code | census_tract | conforming_loan_limit | derived_loan_product_type | derived_dwelling_category | derived_ethnicity | derived_race | derived_sex | action_taken | purchaser_type | preapproval | loan_type | loan_purpose | lien_status | reverse_mortgage | open.end_line_of_credit | business_or_commercial_purpose | loan_amount | loan_to_value_ratio | interest_rate | rate_spread | hoepa_status | total_loan_costs | total_points_and_fees | origination_charges | discount_points | lender_credits | loan_term | prepayment_penalty_term | intro_rate_period | negative_amortization | interest_only_payment | balloon_payment | other_nonamortizing_features | property_value | construction_method | occupancy_type | manufactured_home_secured_property_type | manufactured_home_land_property_interest | total_units | multifamily_affordable_units | income | debt_to_income_ratio | applicant_credit_score_type | co.applicant_credit_score_type | applicant_ethnicity.1 | applicant_ethnicity.2 | applicant_ethnicity.3 | applicant_ethnicity.4 | applicant_ethnicity.5 | co.applicant_ethnicity.1 | co.applicant_ethnicity.2 | co.applicant_ethnicity.3 | co.applicant_ethnicity.4 | co.applicant_ethnicity.5 | applicant_ethnicity_observed | co.applicant_ethnicity_observed | applicant_race.1 | applicant_race.2 | applicant_race.3 | applicant_race.4 | applicant_race.5 | co.applicant_race.1 | co.applicant_race.2 | co.applicant_race.3 | co.applicant_race.4 | co.applicant_race.5 | applicant_race_observed | co.applicant_race_observed | applicant_sex | co.applicant_sex | applicant_sex_observed | co.applicant_sex_observed | applicant_age | co.applicant_age | applicant_age_above_62 | co.applicant_age_above_62 | submission_of_application | initially_payable_to_institution | aus.1 | aus.2 | aus.3 | aus.4 | aus.5 | denial_reason.1 | denial_reason.2 | denial_reason.3 | denial_reason.4 | tract_population | tract_minority_population_percent | ffiec_msa_md_median_family_income | tract_to_msa_income_percentage | tract_owner_occupied_units | tract_one_to_four_family_homes | tract_median_age_of_housing_units | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 49996 | 2019 | 5493005LE3FE2MIPPW66 | 31084 | CA | 6037 | 6.04e+09 | C | Conventional:First Lien | Single Family (1-4 Units):Site-Built | Not Hispanic or Latino | Black or African American | Male | 1 | 6 | 2 | 1 | 31 | 1 | 2 | 2 | 2 | 85000 | 15.385 | 4.875 | 0.882 | 2 | 3169 | NA | 1760 | NA | NA | 360 | NA | NA | 2 | 2 | 2 | 2 | 525000 | 1 | 1 | 3 | 5 | 1 | NA | 30 | 36 | 7 | 10 | 2 | NA | NA | NA | NA | 5 | NA | NA | NA | NA | 1 | 4 | 3 | NA | NA | NA | NA | 8 | NA | NA | NA | NA | 1 | 4 | 1 | 5 | 1 | 4 | 55-64 | 9999 | No | NA | 1 | 1 | 1 | NA | NA | NA | NA | 10 | NA | NA | NA | 5569 | 79.3 | 73100 | 129 | 1100 | 1330 | 59 |
| 49997 | 2019 | 5493005LE3FE2MIPPW66 | 36084 | CA | 6013 | 6.01e+09 | C | Conventional:First Lien | Single Family (1-4 Units):Site-Built | Not Hispanic or Latino | Asian | Joint | 1 | 6 | 2 | 1 | 32 | 1 | 2 | 2 | 2 | 615000 | 73.653 | 4.875 | 0.588 | 2 | 4054 | NA | 1760 | NA | 3834 | 360 | NA | NA | 2 | 2 | 2 | 2 | 835000 | 1 | 1 | 3 | 5 | 1 | NA | 232 | 20%-<30% | 7 | 7 | 2 | NA | NA | NA | NA | 2 | NA | NA | NA | NA | 2 | 2 | 2 | 21 | NA | NA | NA | 2 | 21 | NA | NA | NA | 2 | 2 | 1 | 2 | 2 | 2 | 45-54 | 35-44 | No | No | 1 | 1 | 1 | NA | NA | NA | NA | 10 | NA | NA | NA | 7492 | 80.7 | 111700 | 214 | 1682 | 1892 | 9 |
| 49998 | 2019 | 5493005LE3FE2MIPPW66 | 31084 | CA | 6037 | 6.04e+09 | C | Conventional:First Lien | Single Family (1-4 Units):Site-Built | Not Hispanic or Latino | Black or African American | Male | 1 | 6 | 2 | 1 | 32 | 1 | 2 | 2 | 2 | 385000 | 79.835 | 4.875 | 0.854 | 2 | 12654 | NA | 10490 | 8730 | NA | 360 | NA | NA | 2 | 2 | 2 | 2 | 485000 | 1 | 1 | 3 | 5 | 1 | NA | 83 | 38 | 7 | 10 | 2 | NA | NA | NA | NA | 5 | NA | NA | NA | NA | 1 | 4 | 3 | NA | NA | NA | NA | 8 | NA | NA | NA | NA | 1 | 4 | 1 | 5 | 1 | 4 | 55-64 | 9999 | No | NA | 1 | 1 | 1 | NA | NA | NA | NA | 10 | NA | NA | NA | 4676 | 97.0 | 73100 | 73 | 389 | 877 | 63 |
| 49999 | 2019 | 5493005LE3FE2MIPPW66 | 33700 | CA | 6099 | 6.10e+09 | C | Conventional:First Lien | Single Family (1-4 Units):Site-Built | Not Hispanic or Latino | White | Male | 1 | 6 | 2 | 1 | 1 | 1 | 2 | 2 | 2 | 135000 | 80 | 5.49 | 1.4 | 2 | 3708.5 | NA | 1760 | NA | NA | 360 | NA | NA | 2 | 2 | 2 | 2 | 175000 | 1 | 1 | 3 | 5 | 1 | NA | 60 | 30%-<36% | 7 | 10 | 2 | NA | NA | NA | NA | 5 | NA | NA | NA | NA | 2 | 4 | 5 | NA | NA | NA | NA | 8 | NA | NA | NA | NA | 2 | 4 | 1 | 5 | 2 | 4 | 25-34 | 9999 | No | NA | 1 | 1 | 1 | NA | NA | NA | NA | 10 | NA | NA | NA | 4824 | 47.5 | 64500 | 74 | 788 | 1396 | 49 |
| 50000 | 2019 | 5493005LE3FE2MIPPW66 | 11244 | CA | 6059 | 6.06e+09 | C | Conventional:First Lien | Single Family (1-4 Units):Site-Built | Not Hispanic or Latino | White | Male | 1 | 71 | 2 | 1 | 32 | 1 | 2 | 2 | 2 | 155000 | 24.59 | 5.25 | 1.256 | 2 | 6108 | NA | 4039 | 2279 | NA | 360 | NA | NA | 2 | 2 | 2 | 2 | 615000 | 1 | 1 | 3 | 5 | 1 | NA | 66 | <20% | 7 | 10 | 2 | NA | NA | NA | NA | 5 | NA | NA | NA | NA | 1 | 4 | 5 | NA | NA | NA | NA | 8 | NA | NA | NA | NA | 1 | 4 | 1 | 5 | 1 | 4 | 65-74 | 9999 | Yes | NA | 1 | 1 | 1 | NA | NA | NA | NA | 10 | NA | NA | NA | 5706 | 70.4 | 97900 | 65 | 734 | 1117 | 51 |
hmda_ca <- subset(hmda_ca,business_or_commercial_purpose=="2")
str(hmda_ca)
## 'data.frame': 48030 obs. of 99 variables:
## $ activity_year : int 2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
## $ lei : chr "549300HN58ONH5KNJJ12" "254900WTZC5SSKIN2M11" "254900WTZC5SSKIN2M11" "254900WTZC5SSKIN2M11" ...
## $ derived_msa.md : int 31084 99999 23420 40140 40140 31084 41940 99999 23420 36084 ...
## $ state_code : chr "CA" "CA" "CA" "CA" ...
## $ county_code : int 6037 6033 6019 6065 6071 6037 6085 6105 6019 6013 ...
## $ census_tract : num 6.04e+09 6.03e+09 6.02e+09 6.07e+09 6.07e+09 ...
## $ conforming_loan_limit : chr "NC" "C" "C" "C" ...
## $ derived_loan_product_type : chr "Conventional:First Lien" "Conventional:First Lien" "Conventional:First Lien" "Conventional:First Lien" ...
## $ derived_dwelling_category : chr "Single Family (1-4 Units):Site-Built" "Single Family (1-4 Units):Manufactured" "Single Family (1-4 Units):Site-Built" "Single Family (1-4 Units):Site-Built" ...
## $ derived_ethnicity : chr "Not Hispanic or Latino" "Ethnicity Not Available" "Hispanic or Latino" "Hispanic or Latino" ...
## $ derived_race : chr "White" "White" "Race Not Available" "White" ...
## $ derived_sex : chr "Male" "Male" "Joint" "Male" ...
## $ action_taken : int 1 4 1 1 3 1 2 3 4 1 ...
## $ purchaser_type : int 9 0 0 0 0 0 0 0 0 0 ...
## $ preapproval : int 1 2 2 2 2 2 2 2 2 2 ...
## $ loan_type : int 1 1 1 1 1 1 1 1 1 1 ...
## $ loan_purpose : int 1 4 31 31 1 32 32 31 1 31 ...
## $ lien_status : int 1 1 1 1 1 1 1 1 1 1 ...
## $ reverse_mortgage : int 2 2 2 2 2 2 2 2 2 2 ...
## $ open.end_line_of_credit : int 2 2 2 2 2 2 2 2 2 2 ...
## $ business_or_commercial_purpose : int 2 2 2 2 2 2 2 2 2 2 ...
## $ loan_amount : int 1125000 165000 175000 315000 115000 165000 575000 115000 205000 265000 ...
## $ loan_to_value_ratio : chr "80" NA "74.09" "74.73" ...
## $ interest_rate : chr "5.25" NA "7" "6.625" ...
## $ rate_spread : chr NA NA "2.45" "2.054" ...
## $ hoepa_status : int 2 3 2 2 3 2 3 3 3 2 ...
## $ total_loan_costs : chr "16765" NA "6121.9" "12961.2" ...
## $ total_points_and_fees : chr NA NA NA NA ...
## $ origination_charges : chr "11200" NA "4535" "10985" ...
## $ discount_points : chr NA NA NA NA ...
## $ lender_credits : chr NA NA "500" "650" ...
## $ loan_term : chr "360" "240" "360" "360" ...
## $ prepayment_penalty_term : chr NA NA NA NA ...
## $ intro_rate_period : chr NA NA NA NA ...
## $ negative_amortization : int 2 2 2 2 2 2 2 2 2 2 ...
## $ interest_only_payment : int 2 2 2 2 2 2 2 2 2 2 ...
## $ balloon_payment : int 2 2 2 2 2 2 2 2 2 2 ...
## $ other_nonamortizing_features : int 2 2 2 2 2 2 2 2 2 2 ...
## $ property_value : chr "1405000" NA "235000" "415000" ...
## $ construction_method : int 1 2 1 1 1 1 1 1 1 1 ...
## $ occupancy_type : int 1 1 1 1 1 1 1 2 1 1 ...
## $ manufactured_home_secured_property_type : int 3 1 3 3 3 3 3 3 3 3 ...
## $ manufactured_home_land_property_interest: int 5 1 5 5 5 5 5 5 5 5 ...
## $ total_units : chr "1" "1" "1" "1" ...
## $ multifamily_affordable_units : chr NA NA NA NA ...
## $ income : int 432 44 72 146 42 64 129 66 59 48 ...
## $ debt_to_income_ratio : chr "20%-<30%" NA "20%-<30%" "20%-<30%" ...
## $ applicant_credit_score_type : int 2 9 3 3 3 3 3 3 9 3 ...
## $ co.applicant_credit_score_type : int 10 9 9 10 9 9 9 10 9 10 ...
## $ applicant_ethnicity.1 : int 2 3 1 1 1 1 1 2 1 1 ...
## $ applicant_ethnicity.2 : int NA NA 11 NA 11 11 11 NA NA NA ...
## $ applicant_ethnicity.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ applicant_ethnicity.4 : logi NA NA NA NA NA NA ...
## $ applicant_ethnicity.5 : logi NA NA NA NA NA NA ...
## $ co.applicant_ethnicity.1 : int 5 5 1 5 1 1 1 5 5 5 ...
## $ co.applicant_ethnicity.2 : int NA NA 11 NA 11 11 11 NA NA NA ...
## $ co.applicant_ethnicity.3 : logi NA NA NA NA NA NA ...
## $ co.applicant_ethnicity.4 : logi NA NA NA NA NA NA ...
## $ co.applicant_ethnicity.5 : logi NA NA NA NA NA NA ...
## $ applicant_ethnicity_observed : int 2 2 2 2 2 2 2 2 2 2 ...
## $ co.applicant_ethnicity_observed : int 4 4 2 4 2 2 2 4 4 4 ...
## $ applicant_race.1 : int 5 5 6 5 5 5 5 2 5 5 ...
## $ applicant_race.2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ applicant_race.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ applicant_race.4 : int NA NA NA NA NA NA NA NA NA NA ...
## $ applicant_race.5 : logi NA NA NA NA NA NA ...
## $ co.applicant_race.1 : int 8 8 6 8 5 5 5 8 8 8 ...
## $ co.applicant_race.2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ co.applicant_race.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ co.applicant_race.4 : logi NA NA NA NA NA NA ...
## $ co.applicant_race.5 : logi NA NA NA NA NA NA ...
## $ applicant_race_observed : int 2 2 2 2 2 2 2 2 2 2 ...
## $ co.applicant_race_observed : int 4 4 2 4 2 2 2 4 4 4 ...
## $ applicant_sex : int 1 1 1 1 1 2 1 1 1 1 ...
## $ co.applicant_sex : int 5 5 2 5 2 2 1 5 5 5 ...
## $ applicant_sex_observed : int 2 2 2 2 2 2 2 2 2 2 ...
## $ co.applicant_sex_observed : int 4 4 2 4 2 2 2 4 4 4 ...
## $ applicant_age : chr "35-44" "35-44" "35-44" "35-44" ...
## $ co.applicant_age : chr "9999" "9999" "35-44" "9999" ...
## $ applicant_age_above_62 : chr "No" "No" "No" "No" ...
## $ co.applicant_age_above_62 : chr NA NA "No" NA ...
## $ submission_of_application : int 1 1 1 2 2 2 2 1 2 2 ...
## $ initially_payable_to_institution : int 2 1 1 1 1 1 1 1 1 1 ...
## $ aus.1 : int 6 6 6 6 6 6 6 6 6 6 ...
## $ aus.2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ aus.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ aus.4 : int NA NA NA NA NA NA NA NA NA NA ...
## $ aus.5 : int NA NA NA NA NA NA NA NA NA NA ...
## $ denial_reason.1 : int 10 10 10 10 4 10 10 4 10 10 ...
## $ denial_reason.2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ denial_reason.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ denial_reason.4 : logi NA NA NA NA NA NA ...
## $ tract_population : int 4162 3516 5493 5344 6857 7118 6848 2628 5321 6235 ...
## $ tract_minority_population_percent : num 16.2 21.7 83.8 47.4 66.7 ...
## $ ffiec_msa_md_median_family_income : int 73100 64800 57300 69700 69700 73100 129900 64800 57300 111700 ...
## $ tract_to_msa_income_percentage : int 249 100 74 130 61 78 108 89 151 43 ...
## $ tract_owner_occupied_units : int 1369 1147 540 1324 1129 1102 1844 876 1355 518 ...
## $ tract_one_to_four_family_homes : int 1635 1663 1255 1826 2463 1540 2125 1908 1612 1575 ...
## $ tract_median_age_of_housing_units : int 56 32 48 10 54 54 49 39 13 17 ...
#rename(hmda_ca, ethnicity=derived_ethnicity, race=derived_race,
#sex=derived_sex)
hmda_ca <- subset(hmda_ca,business_or_commercial_purpose=="2")
str(hmda_ca)
## 'data.frame': 48030 obs. of 99 variables:
## $ activity_year : int 2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
## $ lei : chr "549300HN58ONH5KNJJ12" "254900WTZC5SSKIN2M11" "254900WTZC5SSKIN2M11" "254900WTZC5SSKIN2M11" ...
## $ derived_msa.md : int 31084 99999 23420 40140 40140 31084 41940 99999 23420 36084 ...
## $ state_code : chr "CA" "CA" "CA" "CA" ...
## $ county_code : int 6037 6033 6019 6065 6071 6037 6085 6105 6019 6013 ...
## $ census_tract : num 6.04e+09 6.03e+09 6.02e+09 6.07e+09 6.07e+09 ...
## $ conforming_loan_limit : chr "NC" "C" "C" "C" ...
## $ derived_loan_product_type : chr "Conventional:First Lien" "Conventional:First Lien" "Conventional:First Lien" "Conventional:First Lien" ...
## $ derived_dwelling_category : chr "Single Family (1-4 Units):Site-Built" "Single Family (1-4 Units):Manufactured" "Single Family (1-4 Units):Site-Built" "Single Family (1-4 Units):Site-Built" ...
## $ derived_ethnicity : chr "Not Hispanic or Latino" "Ethnicity Not Available" "Hispanic or Latino" "Hispanic or Latino" ...
## $ derived_race : chr "White" "White" "Race Not Available" "White" ...
## $ derived_sex : chr "Male" "Male" "Joint" "Male" ...
## $ action_taken : int 1 4 1 1 3 1 2 3 4 1 ...
## $ purchaser_type : int 9 0 0 0 0 0 0 0 0 0 ...
## $ preapproval : int 1 2 2 2 2 2 2 2 2 2 ...
## $ loan_type : int 1 1 1 1 1 1 1 1 1 1 ...
## $ loan_purpose : int 1 4 31 31 1 32 32 31 1 31 ...
## $ lien_status : int 1 1 1 1 1 1 1 1 1 1 ...
## $ reverse_mortgage : int 2 2 2 2 2 2 2 2 2 2 ...
## $ open.end_line_of_credit : int 2 2 2 2 2 2 2 2 2 2 ...
## $ business_or_commercial_purpose : int 2 2 2 2 2 2 2 2 2 2 ...
## $ loan_amount : int 1125000 165000 175000 315000 115000 165000 575000 115000 205000 265000 ...
## $ loan_to_value_ratio : chr "80" NA "74.09" "74.73" ...
## $ interest_rate : chr "5.25" NA "7" "6.625" ...
## $ rate_spread : chr NA NA "2.45" "2.054" ...
## $ hoepa_status : int 2 3 2 2 3 2 3 3 3 2 ...
## $ total_loan_costs : chr "16765" NA "6121.9" "12961.2" ...
## $ total_points_and_fees : chr NA NA NA NA ...
## $ origination_charges : chr "11200" NA "4535" "10985" ...
## $ discount_points : chr NA NA NA NA ...
## $ lender_credits : chr NA NA "500" "650" ...
## $ loan_term : chr "360" "240" "360" "360" ...
## $ prepayment_penalty_term : chr NA NA NA NA ...
## $ intro_rate_period : chr NA NA NA NA ...
## $ negative_amortization : int 2 2 2 2 2 2 2 2 2 2 ...
## $ interest_only_payment : int 2 2 2 2 2 2 2 2 2 2 ...
## $ balloon_payment : int 2 2 2 2 2 2 2 2 2 2 ...
## $ other_nonamortizing_features : int 2 2 2 2 2 2 2 2 2 2 ...
## $ property_value : chr "1405000" NA "235000" "415000" ...
## $ construction_method : int 1 2 1 1 1 1 1 1 1 1 ...
## $ occupancy_type : int 1 1 1 1 1 1 1 2 1 1 ...
## $ manufactured_home_secured_property_type : int 3 1 3 3 3 3 3 3 3 3 ...
## $ manufactured_home_land_property_interest: int 5 1 5 5 5 5 5 5 5 5 ...
## $ total_units : chr "1" "1" "1" "1" ...
## $ multifamily_affordable_units : chr NA NA NA NA ...
## $ income : int 432 44 72 146 42 64 129 66 59 48 ...
## $ debt_to_income_ratio : chr "20%-<30%" NA "20%-<30%" "20%-<30%" ...
## $ applicant_credit_score_type : int 2 9 3 3 3 3 3 3 9 3 ...
## $ co.applicant_credit_score_type : int 10 9 9 10 9 9 9 10 9 10 ...
## $ applicant_ethnicity.1 : int 2 3 1 1 1 1 1 2 1 1 ...
## $ applicant_ethnicity.2 : int NA NA 11 NA 11 11 11 NA NA NA ...
## $ applicant_ethnicity.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ applicant_ethnicity.4 : logi NA NA NA NA NA NA ...
## $ applicant_ethnicity.5 : logi NA NA NA NA NA NA ...
## $ co.applicant_ethnicity.1 : int 5 5 1 5 1 1 1 5 5 5 ...
## $ co.applicant_ethnicity.2 : int NA NA 11 NA 11 11 11 NA NA NA ...
## $ co.applicant_ethnicity.3 : logi NA NA NA NA NA NA ...
## $ co.applicant_ethnicity.4 : logi NA NA NA NA NA NA ...
## $ co.applicant_ethnicity.5 : logi NA NA NA NA NA NA ...
## $ applicant_ethnicity_observed : int 2 2 2 2 2 2 2 2 2 2 ...
## $ co.applicant_ethnicity_observed : int 4 4 2 4 2 2 2 4 4 4 ...
## $ applicant_race.1 : int 5 5 6 5 5 5 5 2 5 5 ...
## $ applicant_race.2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ applicant_race.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ applicant_race.4 : int NA NA NA NA NA NA NA NA NA NA ...
## $ applicant_race.5 : logi NA NA NA NA NA NA ...
## $ co.applicant_race.1 : int 8 8 6 8 5 5 5 8 8 8 ...
## $ co.applicant_race.2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ co.applicant_race.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ co.applicant_race.4 : logi NA NA NA NA NA NA ...
## $ co.applicant_race.5 : logi NA NA NA NA NA NA ...
## $ applicant_race_observed : int 2 2 2 2 2 2 2 2 2 2 ...
## $ co.applicant_race_observed : int 4 4 2 4 2 2 2 4 4 4 ...
## $ applicant_sex : int 1 1 1 1 1 2 1 1 1 1 ...
## $ co.applicant_sex : int 5 5 2 5 2 2 1 5 5 5 ...
## $ applicant_sex_observed : int 2 2 2 2 2 2 2 2 2 2 ...
## $ co.applicant_sex_observed : int 4 4 2 4 2 2 2 4 4 4 ...
## $ applicant_age : chr "35-44" "35-44" "35-44" "35-44" ...
## $ co.applicant_age : chr "9999" "9999" "35-44" "9999" ...
## $ applicant_age_above_62 : chr "No" "No" "No" "No" ...
## $ co.applicant_age_above_62 : chr NA NA "No" NA ...
## $ submission_of_application : int 1 1 1 2 2 2 2 1 2 2 ...
## $ initially_payable_to_institution : int 2 1 1 1 1 1 1 1 1 1 ...
## $ aus.1 : int 6 6 6 6 6 6 6 6 6 6 ...
## $ aus.2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ aus.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ aus.4 : int NA NA NA NA NA NA NA NA NA NA ...
## $ aus.5 : int NA NA NA NA NA NA NA NA NA NA ...
## $ denial_reason.1 : int 10 10 10 10 4 10 10 4 10 10 ...
## $ denial_reason.2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ denial_reason.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ denial_reason.4 : logi NA NA NA NA NA NA ...
## $ tract_population : int 4162 3516 5493 5344 6857 7118 6848 2628 5321 6235 ...
## $ tract_minority_population_percent : num 16.2 21.7 83.8 47.4 66.7 ...
## $ ffiec_msa_md_median_family_income : int 73100 64800 57300 69700 69700 73100 129900 64800 57300 111700 ...
## $ tract_to_msa_income_percentage : int 249 100 74 130 61 78 108 89 151 43 ...
## $ tract_owner_occupied_units : int 1369 1147 540 1324 1129 1102 1844 876 1355 518 ...
## $ tract_one_to_four_family_homes : int 1635 1663 1255 1826 2463 1540 2125 1908 1612 1575 ...
## $ tract_median_age_of_housing_units : int 56 32 48 10 54 54 49 39 13 17 ...
hmda_ca <- subset(hmda_ca,occupancy_type=="1")
dim(hmda_ca) #45735 99
## [1] 45735 99
#xkabledplyhead(hmda_ca,5)
#xkabledplytail(hmda_ca,5)
loadPkg("sqldf")
names(hmda_ca)
## [1] "activity_year"
## [2] "lei"
## [3] "derived_msa.md"
## [4] "state_code"
## [5] "county_code"
## [6] "census_tract"
## [7] "conforming_loan_limit"
## [8] "derived_loan_product_type"
## [9] "derived_dwelling_category"
## [10] "derived_ethnicity"
## [11] "derived_race"
## [12] "derived_sex"
## [13] "action_taken"
## [14] "purchaser_type"
## [15] "preapproval"
## [16] "loan_type"
## [17] "loan_purpose"
## [18] "lien_status"
## [19] "reverse_mortgage"
## [20] "open.end_line_of_credit"
## [21] "business_or_commercial_purpose"
## [22] "loan_amount"
## [23] "loan_to_value_ratio"
## [24] "interest_rate"
## [25] "rate_spread"
## [26] "hoepa_status"
## [27] "total_loan_costs"
## [28] "total_points_and_fees"
## [29] "origination_charges"
## [30] "discount_points"
## [31] "lender_credits"
## [32] "loan_term"
## [33] "prepayment_penalty_term"
## [34] "intro_rate_period"
## [35] "negative_amortization"
## [36] "interest_only_payment"
## [37] "balloon_payment"
## [38] "other_nonamortizing_features"
## [39] "property_value"
## [40] "construction_method"
## [41] "occupancy_type"
## [42] "manufactured_home_secured_property_type"
## [43] "manufactured_home_land_property_interest"
## [44] "total_units"
## [45] "multifamily_affordable_units"
## [46] "income"
## [47] "debt_to_income_ratio"
## [48] "applicant_credit_score_type"
## [49] "co.applicant_credit_score_type"
## [50] "applicant_ethnicity.1"
## [51] "applicant_ethnicity.2"
## [52] "applicant_ethnicity.3"
## [53] "applicant_ethnicity.4"
## [54] "applicant_ethnicity.5"
## [55] "co.applicant_ethnicity.1"
## [56] "co.applicant_ethnicity.2"
## [57] "co.applicant_ethnicity.3"
## [58] "co.applicant_ethnicity.4"
## [59] "co.applicant_ethnicity.5"
## [60] "applicant_ethnicity_observed"
## [61] "co.applicant_ethnicity_observed"
## [62] "applicant_race.1"
## [63] "applicant_race.2"
## [64] "applicant_race.3"
## [65] "applicant_race.4"
## [66] "applicant_race.5"
## [67] "co.applicant_race.1"
## [68] "co.applicant_race.2"
## [69] "co.applicant_race.3"
## [70] "co.applicant_race.4"
## [71] "co.applicant_race.5"
## [72] "applicant_race_observed"
## [73] "co.applicant_race_observed"
## [74] "applicant_sex"
## [75] "co.applicant_sex"
## [76] "applicant_sex_observed"
## [77] "co.applicant_sex_observed"
## [78] "applicant_age"
## [79] "co.applicant_age"
## [80] "applicant_age_above_62"
## [81] "co.applicant_age_above_62"
## [82] "submission_of_application"
## [83] "initially_payable_to_institution"
## [84] "aus.1"
## [85] "aus.2"
## [86] "aus.3"
## [87] "aus.4"
## [88] "aus.5"
## [89] "denial_reason.1"
## [90] "denial_reason.2"
## [91] "denial_reason.3"
## [92] "denial_reason.4"
## [93] "tract_population"
## [94] "tract_minority_population_percent"
## [95] "ffiec_msa_md_median_family_income"
## [96] "tract_to_msa_income_percentage"
## [97] "tract_owner_occupied_units"
## [98] "tract_one_to_four_family_homes"
## [99] "tract_median_age_of_housing_units"
sqldf("select count(distinct(county_code)) from hmda_ca")
## count(distinct(county_code))
## 1 59
unloadPkg("sqldf")
hmda_ca1<-hmda_ca%>%filter(action_taken %in% c("1", "3"))
hmda_ca<-hmda_ca1
dim(hmda_ca) #30661 99
## [1] 30661 99
hmda_ca_final <- hmda_ca[c(10,11,12,13,22,24,39,46,50,62,74,78)]
str(hmda_ca_final)
## 'data.frame': 30661 obs. of 12 variables:
## $ derived_ethnicity : chr "Not Hispanic or Latino" "Hispanic or Latino" "Hispanic or Latino" "Hispanic or Latino" ...
## $ derived_race : chr "White" "Race Not Available" "White" "White" ...
## $ derived_sex : chr "Male" "Joint" "Male" "Joint" ...
## $ action_taken : int 1 1 1 3 1 1 1 1 3 1 ...
## $ loan_amount : int 1125000 175000 315000 115000 165000 265000 445000 335000 315000 285000 ...
## $ interest_rate : chr "5.25" "7" "6.625" NA ...
## $ property_value : chr "1405000" "235000" "415000" "145000" ...
## $ income : int 432 72 146 42 64 48 101 125 71 78 ...
## $ applicant_ethnicity.1: int 2 1 1 1 1 1 3 1 1 1 ...
## $ applicant_race.1 : int 5 6 5 5 5 5 6 5 5 5 ...
## $ applicant_sex : int 1 1 1 1 2 1 2 1 1 1 ...
## $ applicant_age : chr "35-44" "35-44" "35-44" "25-34" ...
hmda_ca_final_1 = hmda_ca_final
hmda_ca_final_1$derived_ethnicity = factor(hmda_ca_final$derived_ethnicity)
hmda_ca_final_1$derived_race = factor(hmda_ca_final$derived_race)
hmda_ca_final_1$derived_sex = factor(hmda_ca_final$derived_sex)
#hmda_ca_final_1$action_taken = factor(hmda_ca_final$action_taken)
hmda_ca_final_1$loan_amount = as.numeric(hmda_ca_final$loan_amount)
hmda_ca_final_1$interest_rate = as.numeric(hmda_ca_final$interest_rate)
hmda_ca_final_1$property_value = as.numeric(hmda_ca_final$property_value)
hmda_ca_final_1$income = as.numeric(hmda_ca_final$income)
hmda_ca_final_1$applicant_age = factor(hmda_ca_final$applicant_age)
str(hmda_ca_final_1)
## 'data.frame': 30661 obs. of 12 variables:
## $ derived_ethnicity : Factor w/ 4 levels "Ethnicity Not Available",..: 4 2 2 2 2 2 1 2 2 2 ...
## $ derived_race : Factor w/ 8 levels "2 or more minority races",..: 8 7 8 8 8 8 7 8 8 8 ...
## $ derived_sex : Factor w/ 4 levels "Female","Joint",..: 3 2 3 2 1 3 1 3 3 3 ...
## $ action_taken : int 1 1 1 3 1 1 1 1 3 1 ...
## $ loan_amount : num 1125000 175000 315000 115000 165000 ...
## $ interest_rate : num 5.25 7 6.62 NA 6.75 ...
## $ property_value : num 1405000 235000 415000 145000 435000 ...
## $ income : num 432 72 146 42 64 48 101 125 71 78 ...
## $ applicant_ethnicity.1: int 2 1 1 1 1 1 3 1 1 1 ...
## $ applicant_race.1 : int 5 6 5 5 5 5 6 5 5 5 ...
## $ applicant_sex : int 1 1 1 1 2 1 2 1 1 1 ...
## $ applicant_age : Factor w/ 8 levels "<25",">74","25-34",..: 4 4 4 3 4 4 4 5 5 4 ...
#Examine and filter out the missing values
missvalue <- is.na(hmda_ca_final_1)
summary(missvalue)
## derived_ethnicity derived_race derived_sex action_taken
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:30661 FALSE:30661 FALSE:30661 FALSE:30661
##
## loan_amount interest_rate property_value income
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:30661 FALSE:23742 FALSE:28927 FALSE:29359
## TRUE :6919 TRUE :1734 TRUE :1302
## applicant_ethnicity.1 applicant_race.1 applicant_sex applicant_age
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:30661 FALSE:30661 FALSE:30661 FALSE:30661
##
#There are 30,661 observations after we've filtered on all of the relevant fields.
#Missing values are present in interest_rate: 6,919; property_value: 1,734, and income: 1,302.
hmda_ca_final_2 <- hmda_ca_final_1$interest_rate[is.na(hmda_ca_final_1$interest_rate)] <- mean(hmda_ca_final_1$interest_rate, na.rm = TRUE)
hmda_ca_final_2 <- hmda_ca_final_1 %>% drop_na(applicant_ethnicity.1)
hmda_ca_final_2 <- na.omit(hmda_ca_final_1, cols="income")
missvalue1 <- is.na(hmda_ca_final_2)
summary(missvalue1)
## derived_ethnicity derived_race derived_sex action_taken
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:28695 FALSE:28695 FALSE:28695 FALSE:28695
## loan_amount interest_rate property_value income
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:28695 FALSE:28695 FALSE:28695 FALSE:28695
## applicant_ethnicity.1 applicant_race.1 applicant_sex applicant_age
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:28695 FALSE:28695 FALSE:28695 FALSE:28695
#After cleaning out the missing values, we are left with 28,695 observations
#Exploring the categorical values first
library(ggplot2)
ggplot(hmda_ca_final_2, aes(x = factor(derived_sex))) +
geom_bar(color="black", fill="antiquewhite2")+
labs(title="Graph 1. Applicant sex distribution", x="Applicant Sex", y="Count")
ggplot(hmda_ca_final_2, aes(x = factor(applicant_age))) +
geom_bar(color="black", fill="bisque3")+
labs(title="Graph 2. Applicant age distribution", x="Applicant Age", y="Count")
ggplot(hmda_ca_final_2, aes(x = factor(derived_ethnicity))) +
geom_bar(color="black", fill="cornsilk3")+
labs(title="Graph 3. Applicant ethnicity distribution", x="Applicant Ethnicity", y="Count")
ggplot(hmda_ca_final_2, aes(x = factor(action_taken))) +
geom_bar(color="black", fill="azure3")+
labs(title="Graph 4. Action taken distribution", x="Action Taken", y="Count")
ggplot(hmda_ca_final_2, aes(x = factor(derived_race))) +
geom_bar(color="black", fill="azure3")+
labs(title="Graph 5. Derived Race", x="Race of the Applicant", y= "Count")+theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Next, we proceed to exploring the numerical data for normality and outliers. Graphs 8-10 indicate that none of the numerical variables are normally distributed.
ggplot(hmda_ca_final_2,aes(x=loan_amount))+
geom_histogram(color="black", fill="steelblue")+
labs(title=" Graph 6. Histogram of Loan Amount", x="Loan Amount in dollars", y="Frequency")+theme_bw()
ggplot(hmda_ca_final_2,aes(x=interest_rate))+
geom_histogram(color="black", fill="pink")+
labs(title="Graph 8. Histogram of Interest Rate", x="Interest Rates", y="Frequency")+theme_bw()
ggplot(hmda_ca_final_2,aes(x=income))+
geom_histogram(color="black", fill="azure3")+
labs(title="Graph 9. Histogram of Income", x="Income in dollars", y="Frequency")+theme_bw()
ggplot(hmda_ca_final_2,aes(x=property_value))+
geom_histogram(color="black", fill="lightblue")+
labs(title="Graph 10. Histogram of Property Values", x="Property Values in dollars", y="Frequency")+theme_bw()
As another check, we inspect qqnorm plots and confirm that the distributions are not normal(Graph 11-14)
qqnorm(hmda_ca_final_2$interest_rate,
main="Graph 11. QQ Plot of Interest Rates",
ylab="Interest Rate",
col="pink")
qqline(hmda_ca_final_2$interest_rate)
qqnorm(hmda_ca_final_2$income,
main="Graph 13. QQ Plot of Income",
ylab="Income",
col="green")
qqline(hmda_ca_final_2$income)
qqnorm(hmda_ca_final_2$property_value,
main="Graph 12. QQ Plot of Property Values",
ylab="Property Value",
col="blue")
qqline(hmda_ca_final_2$property_value)
qqnorm(hmda_ca_final_2$loan_amount,
main="Graph 14.QQ Plot of Loan Amount",
ylab="Loan Amount",
col="purple")
qqline(hmda_ca_final_1$loan_amount)
Next, we remove outliers and check for normality again.
hmda_ca_final_no_outliers_3 <- outlierKD2(hmda_ca_final_2,interest_rate)
## Outliers identified: 2664
## Propotion (%) of outliers: 10.2
## Mean of the outliers: 7.69
## Mean without removing outliers: 4.49
## Mean if we remove outliers: 4.16
## Outliers successfully removed
hmda_ca_final_no_outliers_4 <- outlierKD2(hmda_ca_final_no_outliers_3,property_value)
## Outliers identified: 1821
## Propotion (%) of outliers: 6.8
## Mean of the outliers: 2132309
## Mean without removing outliers: 644638
## Mean if we remove outliers: 543833
## Outliers successfully removed
hmda_ca_final_no_outliers_5 <- outlierKD2(hmda_ca_final_no_outliers_4,income)
## Outliers identified: 1110
## Propotion (%) of outliers: 4
## Mean of the outliers: 410
## Mean without removing outliers: 118
## Mean if we remove outliers: 107
## Outliers successfully removed
loans <- outlierKD2(hmda_ca_final_no_outliers_5,loan_amount)
## Outliers identified: 451
## Propotion (%) of outliers: 1.6
## Mean of the outliers: 1397129
## Mean without removing outliers: 332997
## Mean if we remove outliers: 316005
## Outliers successfully removed
We then reexamine the variables for normality after removing the outliers.
qqnorm(loans$interest_rate,
main="Graph 15. QQ Plot of Interest Rates",
ylab="Interest Rate",
col="pink")
qqline(loans$interest_rate)
qqnorm(loans$property_value,
main="Graph 16. QQ Plot of Property Values",
ylab="Property Value",
col="blue")
qqline(loans$property_value)
qqnorm(loans$income,
main="Graph 17.QQ Plot of Income",
ylab="Income",
col="green")
qqline(loans$income)
qqnorm(loans$loan_amount,
main="Graph 18. QQ Plot of Loan Amount",
ylab="Loan Amount",
col="purple")
qqline(loans$loan_amount)
The values do not appear to be normally distributed even after removing the outliers
Numerical_var <- subset(loans,select=c(loan_amount, income, property_value, interest_rate))
#str(Numerical_var)
library(kableExtra)
summary_t<-kbl(summary(Numerical_var))%>%
kable_styling()
summary_t
| loan_amount | income | property_value | interest_rate | |
|---|---|---|---|---|
| Min. : 5000 | Min. :-19 | Min. : 15000 | Min. :2 | |
| 1st Qu.:125000 | 1st Qu.: 64 | 1st Qu.: 385000 | 1st Qu.:4 | |
| Median :305000 | Median : 99 | Median : 515000 | Median :4 | |
| Mean :316005 | Mean :107 | Mean : 543833 | Mean :4 | |
| 3rd Qu.:455000 | 3rd Qu.:143 | 3rd Qu.: 665000 | 3rd Qu.:4 | |
| Max. :975000 | Max. :276 | Max. :1175000 | Max. :6 | |
| NA’s :451 | NA’s :1110 | NA’s :1821 | NA’s :2664 |
# GROUP: what's with the NAs in the summary table after we have cleaned the NAs out?
However, the means and the medians are fairly close in all instances; do we keep removing the outliers?
#this chunk is for visual inspection only: EITHER DELETE or FORMAT before the final submission
boxplot(loans$interest_rate)
boxplot(loans$loan_amount)
boxplot(loans$property_value)
boxplot(loans$income)
library(lattice)
#dim(loans) #28695 12
pairs(loans)
#str(loans)
library(epiDisplay)
tab1(loans$derived_race, sort.group = "decreasing", cum.percent = TRUE)
## loans$derived_race :
## Frequency Percent Cum. percent
## White 17022 59.3 59.3
## Race Not Available 4295 15.0 74.3
## Asian 3496 12.2 86.5
## Black or African American 1905 6.6 93.1
## Joint 1349 4.7 97.8
## American Indian or Alaska Native 258 0.9 98.7
## Native Hawaiian or Other Pacific Islander 243 0.8 99.6
## 2 or more minority races 127 0.4 100.0
## Total 28695 100.0 100.0
##Preparing data for corrplot
cor_loans<-cor(loans_all_num, use="complete.obs")
xkabledply(cor_loans)
| action_taken | loan_amount | interest_rate | property_value | income | applicant_ethnicity.1 | applicant_race.1 | applicant_sex | applicant_age | Black | AIAN | NHPI | Asian | Joint | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| action_taken | 1.0000 | -0.2118 | 0.2412 | -0.0772 | -0.1038 | 0.0134 | 0.0085 | 0.0272 | 0.0478 | 0.1045 | 0.0326 | 0.0389 | 0.0323 | 0.0028 |
| loan_amount | -0.2118 | 1.0000 | -0.2426 | 0.4430 | 0.4005 | 0.0252 | 0.0207 | -0.0666 | -0.1318 | 0.0034 | -0.0221 | -0.0066 | 0.0861 | 0.0529 |
| interest_rate | 0.2412 | -0.2426 | 1.0000 | -0.0796 | -0.1274 | -0.0007 | -0.0143 | 0.0433 | 0.0161 | 0.0430 | -0.0019 | 0.0208 | -0.0296 | -0.0263 |
| property_value | -0.0772 | 0.4430 | -0.0796 | 1.0000 | 0.4333 | 0.0192 | 0.0091 | -0.0035 | 0.0954 | -0.0409 | -0.0392 | -0.0082 | 0.0995 | 0.0297 |
| income | -0.1038 | 0.4005 | -0.1274 | 0.4333 | 1.0000 | 0.0284 | 0.0353 | -0.0794 | -0.0122 | -0.0347 | -0.0274 | -0.0116 | 0.0749 | 0.1145 |
| applicant_ethnicity.1 | 0.0134 | 0.0252 | -0.0007 | 0.0192 | 0.0284 | 1.0000 | 0.0463 | 0.0686 | 0.0064 | -0.0128 | 0.0016 | 0.0001 | 0.0150 | -0.0150 |
| applicant_race.1 | 0.0085 | 0.0207 | -0.0143 | 0.0091 | 0.0353 | 0.0463 | 1.0000 | 0.0218 | -0.0097 | -0.1579 | -0.1099 | 0.2881 | 0.0518 | -0.0153 |
| applicant_sex | 0.0272 | -0.0666 | 0.0433 | -0.0035 | -0.0794 | 0.0686 | 0.0218 | 1.0000 | 0.0292 | 0.0056 | -0.0215 | -0.0009 | -0.0403 | -0.0629 |
| applicant_age | 0.0478 | -0.1318 | 0.0161 | 0.0954 | -0.0122 | 0.0064 | -0.0097 | 0.0292 | 1.0000 | 0.0387 | -0.0089 | 0.0105 | -0.0177 | -0.0092 |
| Black | 0.1045 | 0.0034 | 0.0430 | -0.0409 | -0.0347 | -0.0128 | -0.1579 | 0.0056 | 0.0387 | 1.0000 | -0.0264 | -0.0246 | -0.1005 | -0.0598 |
| AIAN | 0.0326 | -0.0221 | -0.0019 | -0.0392 | -0.0274 | 0.0016 | -0.1099 | -0.0215 | -0.0089 | -0.0264 | 1.0000 | -0.0089 | -0.0361 | -0.0215 |
| NHPI | 0.0389 | -0.0066 | 0.0208 | -0.0082 | -0.0116 | 0.0001 | 0.2881 | -0.0009 | 0.0105 | -0.0246 | -0.0089 | 1.0000 | -0.0337 | -0.0201 |
| Asian | 0.0323 | 0.0861 | -0.0296 | 0.0995 | 0.0749 | 0.0150 | 0.0518 | -0.0403 | -0.0177 | -0.1005 | -0.0361 | -0.0337 | 1.0000 | -0.0819 |
| Joint | 0.0028 | 0.0529 | -0.0263 | 0.0297 | 0.1145 | -0.0150 | -0.0153 | -0.0629 | -0.0092 | -0.0598 | -0.0215 | -0.0201 | -0.0819 | 1.0000 |
loadPkg("corrplot")
corrplot(cor_loans)
contable1 = table(loans$derived_race, loans$action_taken)
xkabledply(contable1, title="Contingency table for Loan Approval and Race")
| 1 | 3 | |
|---|---|---|
| 2 or more minority races | 75 | 52 |
| American Indian or Alaska Native | 170 | 88 |
| Asian | 2672 | 824 |
| Black or African American | 1242 | 663 |
| Joint | 1062 | 287 |
| Native Hawaiian or Other Pacific Islander | 158 | 85 |
| Race Not Available | 3287 | 1008 |
| White | 14107 | 2915 |
chitest1 = chisq.test(contable1)
chitest1
##
## Pearson's Chi-squared test
##
## data: contable1
## X-squared = 492, df = 7, p-value <2e-16
contable2 = table(loans$derived_sex, loans$action_taken)
xkabledply(contable2, title="Contingency table for Loan Approval and Sex")
| 1 | 3 | |
|---|---|---|
| Female | 4639 | 1346 |
| Joint | 10354 | 2065 |
| Male | 6771 | 2161 |
| Sex Not Available | 1009 | 350 |
chitest2 = chisq.test(contable2)
chitest2
##
## Pearson's Chi-squared test
##
## data: contable2
## X-squared = 225, df = 3, p-value <2e-16
contable3 = table(loans$derived_ethnicity, loans$action_taken)
xkabledply(contable3, title="Contingency table for Loan Approval and Ethnicity")
| 1 | 3 | |
|---|---|---|
| Ethnicity Not Available | 2891 | 830 |
| Hispanic or Latino | 4145 | 1291 |
| Joint | 1229 | 285 |
| Not Hispanic or Latino | 14508 | 3516 |
chitest3 = chisq.test(contable3)
chitest3
##
## Pearson's Chi-squared test
##
## data: contable3
## X-squared = 56, df = 3, p-value = 5e-12
contable4 = table(loans$applicant_age, loans$action_taken)
xkabledply(contable4, title="Contingency table for Loan Approval and Age")
| 1 | 3 | |
|---|---|---|
| <25 | 253 | 58 |
| >74 | 1621 | 373 |
| 25-34 | 3897 | 823 |
| 35-44 | 5606 | 1417 |
| 45-54 | 4924 | 1382 |
| 55-64 | 3674 | 1089 |
| 65-74 | 2798 | 779 |
| 8888 | 0 | 1 |
chitest4 = chisq.test(contable4)
chitest4
##
## Pearson's Chi-squared test
##
## data: contable4
## X-squared = 63, df = 7, p-value = 4e-11
#adding New Code for final and downsampling
hmda <- data.frame(loans)
str(hmda$action_taken)
## int [1:28695] 1 1 1 3 1 1 1 1 3 1 ...
hmda$approval<- ifelse(hmda$action_taken=="1", "Approved","Denied")
hmda$approval<- factor(hmda$approval)
str(hmda)
## 'data.frame': 28695 obs. of 20 variables:
## $ derived_ethnicity : Factor w/ 4 levels "Ethnicity Not Available",..: 4 2 2 2 2 2 1 2 2 2 ...
## $ derived_race : Factor w/ 8 levels "2 or more minority races",..: 8 7 8 8 8 8 7 8 8 8 ...
## $ derived_sex : Factor w/ 4 levels "Female","Joint",..: 3 2 3 2 1 3 1 3 3 3 ...
## $ action_taken : int 1 1 1 3 1 1 1 1 3 1 ...
## $ loan_amount : num NA 175000 315000 115000 165000 265000 445000 335000 315000 285000 ...
## $ interest_rate : num 5.25 NA NA 4.46 NA ...
## $ property_value : num NA 235000 415000 145000 435000 415000 705000 425000 865000 405000 ...
## $ income : num NA 72 146 42 64 48 101 125 71 78 ...
## $ applicant_ethnicity.1: int 2 1 1 1 1 1 3 1 1 1 ...
## $ applicant_race.1 : int 5 6 5 5 5 5 6 5 5 5 ...
## $ applicant_sex : int 1 1 1 1 2 1 2 1 1 1 ...
## $ applicant_age : Factor w/ 8 levels "<25",">74","25-34",..: 4 4 4 3 4 4 4 5 5 4 ...
## $ Black : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AIAN : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NHPI : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Asian : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Joint : num 0 0 0 0 0 0 0 0 0 0 ...
## $ N_A : num 0 1 0 0 0 0 1 0 0 0 ...
## $ two_or : num 0 0 0 0 0 0 0 0 0 0 ...
## $ approval : Factor w/ 2 levels "Approved","Denied": 1 1 1 2 1 1 1 1 2 1 ...
table(hmda$approval)
##
## Approved Denied
## 22773 5922
#Approved Denied
#22773 5922
Checking frequency of approval variable and derived_race variable
with(hmda,
{
print(table(derived_race))
print(table(approval))
}
)
## derived_race
## 2 or more minority races
## 127
## American Indian or Alaska Native
## 258
## Asian
## 3496
## Black or African American
## 1905
## Joint
## 1349
## Native Hawaiian or Other Pacific Islander
## 243
## Race Not Available
## 4295
## White
## 17022
## approval
## Approved Denied
## 22773 5922
Creating DownSample of hmda dataset, the Approved far outnumber the Denied making this dataset severely unbalanced #Approved: 22773 Denied: 5922
Approved<- which(hmda$action_taken=="1")
Denied<- which(hmda$action_taken=="3")
length(Approved)
## [1] 22773
length(Denied)
## [1] 5922
approved_downsample<-sample(Approved,length(Denied))
hmda_down<- hmda[c(approved_downsample, Denied),]
str(hmda_down)
## 'data.frame': 11844 obs. of 20 variables:
## $ derived_ethnicity : Factor w/ 4 levels "Ethnicity Not Available",..: 1 4 4 4 4 3 2 4 4 4 ...
## $ derived_race : Factor w/ 8 levels "2 or more minority races",..: 7 4 8 8 4 8 8 8 8 8 ...
## $ derived_sex : Factor w/ 4 levels "Female","Joint",..: 3 1 1 3 1 2 3 2 1 2 ...
## $ action_taken : int 1 1 1 1 1 1 1 1 1 1 ...
## $ loan_amount : num 105000 245000 245000 135000 245000 125000 455000 185000 195000 615000 ...
## $ interest_rate : num NA 2.88 3.88 5.12 4.38 ...
## $ property_value : num 465000 375000 525000 225000 445000 ...
## $ income : num 154 77 118 0 78 75 101 181 79 162 ...
## $ applicant_ethnicity.1: int 3 2 2 2 2 1 1 2 2 2 ...
## $ applicant_race.1 : int 6 3 5 5 3 5 5 5 5 5 ...
## $ applicant_sex : int 1 2 2 1 2 1 1 2 2 1 ...
## $ applicant_age : Factor w/ 8 levels "<25",">74","25-34",..: 3 3 4 5 4 3 5 6 7 4 ...
## $ Black : num 0 1 0 0 1 0 0 0 0 0 ...
## $ AIAN : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NHPI : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Asian : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Joint : num 0 0 0 0 0 0 0 0 0 0 ...
## $ N_A : num 1 0 0 0 0 0 0 0 0 0 ...
## $ two_or : num 0 0 0 0 0 0 0 0 0 0 ...
## $ approval : Factor w/ 2 levels "Approved","Denied": 1 1 1 1 1 1 1 1 1 1 ...
hmda_downnum<-hmda_down[c(4:11, 13:17, 19)]
hmda_downnum$approved<- ifelse(hmda_downnum$action_taken=="1", 1, 0)
hmda_downnum$denied<- ifelse(hmda_downnum$action_taken=="3", 1, 0)
str(hmda_downnum)
## 'data.frame': 11844 obs. of 16 variables:
## $ action_taken : int 1 1 1 1 1 1 1 1 1 1 ...
## $ loan_amount : num 105000 245000 245000 135000 245000 125000 455000 185000 195000 615000 ...
## $ interest_rate : num NA 2.88 3.88 5.12 4.38 ...
## $ property_value : num 465000 375000 525000 225000 445000 ...
## $ income : num 154 77 118 0 78 75 101 181 79 162 ...
## $ applicant_ethnicity.1: int 3 2 2 2 2 1 1 2 2 2 ...
## $ applicant_race.1 : int 6 3 5 5 3 5 5 5 5 5 ...
## $ applicant_sex : int 1 2 2 1 2 1 1 2 2 1 ...
## $ Black : num 0 1 0 0 1 0 0 0 0 0 ...
## $ AIAN : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NHPI : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Asian : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Joint : num 0 0 0 0 0 0 0 0 0 0 ...
## $ two_or : num 0 0 0 0 0 0 0 0 0 0 ...
## $ approved : num 1 1 1 1 1 1 1 1 1 1 ...
## $ denied : num 0 0 0 0 0 0 0 0 0 0 ...
cor_loans<-cor(hmda_downnum, use="complete.obs")
xkabledply(cor_loans)
| action_taken | loan_amount | interest_rate | property_value | income | applicant_ethnicity.1 | applicant_race.1 | applicant_sex | Black | AIAN | NHPI | Asian | Joint | two_or | approved | denied | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| action_taken | 1.0000 | -0.2425 | 0.3537 | -0.0972 | -0.1224 | 0.0067 | 0.0055 | 0.0261 | 0.1208 | 0.0348 | 0.0383 | 0.0400 | 0.0033 | 0.0412 | -1.0000 | 1.0000 |
| loan_amount | -0.2425 | 1.0000 | -0.2344 | 0.4319 | 0.3160 | 0.0050 | 0.0214 | -0.0522 | -0.0141 | -0.0197 | -0.0104 | 0.0522 | 0.0362 | -0.0108 | 0.2425 | -0.2425 |
| interest_rate | 0.3537 | -0.2344 | 1.0000 | -0.0907 | -0.1297 | 0.0033 | -0.0137 | 0.0424 | 0.0556 | -0.0072 | 0.0172 | -0.0159 | -0.0203 | 0.0228 | -0.3537 | 0.3537 |
| property_value | -0.0972 | 0.4319 | -0.0907 | 1.0000 | 0.3938 | 0.0111 | 0.0241 | -0.0021 | -0.0330 | -0.0351 | 0.0037 | 0.0955 | 0.0183 | -0.0238 | 0.0972 | -0.0972 |
| income | -0.1224 | 0.3160 | -0.1297 | 0.3938 | 1.0000 | 0.0285 | 0.0369 | -0.1041 | -0.0294 | -0.0211 | -0.0109 | 0.0588 | 0.1286 | -0.0045 | 0.1224 | -0.1224 |
| applicant_ethnicity.1 | 0.0067 | 0.0050 | 0.0033 | 0.0111 | 0.0285 | 1.0000 | 0.0521 | 0.0656 | -0.0172 | -0.0035 | 0.0034 | 0.0288 | 0.0015 | -0.0016 | -0.0067 | 0.0067 |
| applicant_race.1 | 0.0055 | 0.0214 | -0.0137 | 0.0241 | 0.0369 | 0.0521 | 1.0000 | 0.0158 | -0.1625 | -0.1111 | 0.3513 | 0.0666 | -0.0189 | -0.0607 | -0.0055 | 0.0055 |
| applicant_sex | 0.0261 | -0.0522 | 0.0424 | -0.0021 | -0.1041 | 0.0656 | 0.0158 | 1.0000 | -0.0002 | -0.0248 | -0.0047 | -0.0407 | -0.0717 | -0.0087 | -0.0261 | 0.0261 |
| Black | 0.1208 | -0.0141 | 0.0556 | -0.0330 | -0.0294 | -0.0172 | -0.1625 | -0.0002 | 1.0000 | -0.0332 | -0.0321 | -0.1168 | -0.0678 | -0.0246 | -0.1208 | 0.1208 |
| AIAN | 0.0348 | -0.0197 | -0.0072 | -0.0351 | -0.0211 | -0.0035 | -0.1111 | -0.0248 | -0.0332 | 1.0000 | -0.0114 | -0.0416 | -0.0241 | -0.0087 | -0.0348 | 0.0348 |
| NHPI | 0.0383 | -0.0104 | 0.0172 | 0.0037 | -0.0109 | 0.0034 | 0.3513 | -0.0047 | -0.0321 | -0.0114 | 1.0000 | -0.0402 | -0.0233 | -0.0084 | -0.0383 | 0.0383 |
| Asian | 0.0400 | 0.0522 | -0.0159 | 0.0955 | 0.0588 | 0.0288 | 0.0666 | -0.0407 | -0.1168 | -0.0416 | -0.0402 | 1.0000 | -0.0849 | -0.0308 | -0.0400 | 0.0400 |
| Joint | 0.0033 | 0.0362 | -0.0203 | 0.0183 | 0.1286 | 0.0015 | -0.0189 | -0.0717 | -0.0678 | -0.0241 | -0.0233 | -0.0849 | 1.0000 | -0.0178 | -0.0033 | 0.0033 |
| two_or | 0.0412 | -0.0108 | 0.0228 | -0.0238 | -0.0045 | -0.0016 | -0.0607 | -0.0087 | -0.0246 | -0.0087 | -0.0084 | -0.0308 | -0.0178 | 1.0000 | -0.0412 | 0.0412 |
| approved | -1.0000 | 0.2425 | -0.3537 | 0.0972 | 0.1224 | -0.0067 | -0.0055 | -0.0261 | -0.1208 | -0.0348 | -0.0383 | -0.0400 | -0.0033 | -0.0412 | 1.0000 | -1.0000 |
| denied | 1.0000 | -0.2425 | 0.3537 | -0.0972 | -0.1224 | 0.0067 | 0.0055 | 0.0261 | 0.1208 | 0.0348 | 0.0383 | 0.0400 | 0.0033 | 0.0412 | -1.0000 | 1.0000 |
loadPkg("corrplot")
corrplot(cor_loans)
Conducting Chi-squared Test on the downsampled dataset (hmda_down)
P-Value for contable1 is lower than .05 therefore we reject the null hypothesis. action_taken and derived_race are not independent of each other
contable1= table(hmda_down$derived_race, hmda_down$action_taken)
xkabledply(contable1, title="Contingency table for Loan Approval and Race")
| 1 | 3 | |
|---|---|---|
| 2 or more minority races | 16 | 52 |
| American Indian or Alaska Native | 42 | 88 |
| Asian | 691 | 824 |
| Black or African American | 287 | 663 |
| Joint | 274 | 287 |
| Native Hawaiian or Other Pacific Islander | 46 | 85 |
| Race Not Available | 822 | 1008 |
| White | 3744 | 2915 |
chitest1 = chisq.test(contable1)
chitest1
##
## Pearson's Chi-squared test
##
## data: contable1
## X-squared = 330, df = 7, p-value <2e-16
contable2= table(hmda_down$derived_ethnicity, hmda_down$action_taken)
xkabledply(contable2, title="Contingency table for Loan Approval and Ethnicity")
| 1 | 3 | |
|---|---|---|
| Ethnicity Not Available | 726 | 830 |
| Hispanic or Latino | 1050 | 1291 |
| Joint | 341 | 285 |
| Not Hispanic or Latino | 3805 | 3516 |
chitest2 = chisq.test(contable2)
chitest2
##
## Pearson's Chi-squared test
##
## data: contable2
## X-squared = 48, df = 3, p-value = 2e-10
contable3= table(hmda_down$derived_sex, hmda_down$action_taken)
xkabledply(contable3, title="Contingency table for Loan Approval and Gender")
| 1 | 3 | |
|---|---|---|
| Female | 1202 | 1346 |
| Joint | 2682 | 2065 |
| Male | 1757 | 2161 |
| Sex Not Available | 281 | 350 |
chitest3 = chisq.test(contable3)
chitest3
##
## Pearson's Chi-squared test
##
## data: contable3
## X-squared = 138, df = 3, p-value <2e-16
contable4= table(hmda_down$applicant_age, hmda_down$action_taken)
xkabledply(contable4, title="Contingency table for Loan Approval and Age")
| 1 | 3 | |
|---|---|---|
| <25 | 69 | 58 |
| >74 | 441 | 373 |
| 25-34 | 1046 | 823 |
| 35-44 | 1442 | 1417 |
| 45-54 | 1241 | 1382 |
| 55-64 | 927 | 1089 |
| 65-74 | 756 | 779 |
| 8888 | 0 | 1 |
chitest4 = chisq.test(contable4)
chitest4
##
## Pearson's Chi-squared test
##
## data: contable4
## X-squared = 55, df = 7, p-value = 1e-09
full_model <- lm(action_taken~., data = loans)
summary(full_model)
##
## Call:
## lm(formula = action_taken ~ ., data = loans)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6631 -0.4953 -0.2729 0.0613 2.1640
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error
## (Intercept) 9.77e-01 1.05e-01
## derived_ethnicityHispanic or Latino 1.04e-01 2.49e-02
## derived_ethnicityJoint 1.08e-01 3.27e-02
## derived_ethnicityNot Hispanic or Latino 1.28e-02 2.46e-02
## derived_raceAmerican Indian or Alaska Native -2.46e-01 9.26e-02
## derived_raceAsian -3.36e-01 7.74e-02
## derived_raceBlack or African American -1.33e-01 7.84e-02
## derived_raceJoint -3.56e-01 8.00e-02
## derived_raceNative Hawaiian or Other Pacific Islander -2.03e-01 9.67e-02
## derived_raceRace Not Available -4.13e-01 7.90e-02
## derived_raceWhite -5.55e-01 7.63e-02
## derived_sexJoint 6.26e-03 1.98e-02
## derived_sexMale 1.09e-01 2.29e-02
## derived_sexSex Not Available 1.30e-01 3.54e-02
## loan_amount -8.11e-07 3.25e-08
## interest_rate 2.39e-01 7.67e-03
## property_value 2.05e-07 3.10e-08
## income -8.06e-04 1.14e-04
## applicant_ethnicity.1 3.22e-03 3.47e-03
## applicant_race.1 4.28e-03 1.49e-03
## applicant_sex 1.29e-02 1.76e-02
## applicant_age>74 -1.42e-01 5.04e-02
## applicant_age25-34 3.76e-02 4.67e-02
## applicant_age35-44 1.05e-01 4.65e-02
## applicant_age45-54 1.18e-01 4.68e-02
## applicant_age55-64 8.29e-02 4.72e-02
## applicant_age65-74 -8.71e-03 4.80e-02
## Black NA NA
## AIAN NA NA
## NHPI NA NA
## Asian NA NA
## Joint NA NA
## N_A NA NA
## two_or NA NA
## t value Pr(>|t|)
## (Intercept) 9.32 < 2e-16 ***
## derived_ethnicityHispanic or Latino 4.18 3.0e-05 ***
## derived_ethnicityJoint 3.29 0.00101 **
## derived_ethnicityNot Hispanic or Latino 0.52 0.60210
## derived_raceAmerican Indian or Alaska Native -2.66 0.00784 **
## derived_raceAsian -4.33 1.5e-05 ***
## derived_raceBlack or African American -1.70 0.09002 .
## derived_raceJoint -4.45 8.7e-06 ***
## derived_raceNative Hawaiian or Other Pacific Islander -2.10 0.03599 *
## derived_raceRace Not Available -5.23 1.7e-07 ***
## derived_raceWhite -7.27 3.7e-13 ***
## derived_sexJoint 0.32 0.75170
## derived_sexMale 4.74 2.1e-06 ***
## derived_sexSex Not Available 3.68 0.00023 ***
## loan_amount -24.97 < 2e-16 ***
## interest_rate 31.10 < 2e-16 ***
## property_value 6.60 4.2e-11 ***
## income -7.06 1.7e-12 ***
## applicant_ethnicity.1 0.93 0.35368
## applicant_race.1 2.87 0.00415 **
## applicant_sex 0.73 0.46523
## applicant_age>74 -2.82 0.00478 **
## applicant_age25-34 0.81 0.42031
## applicant_age35-44 2.25 0.02437 *
## applicant_age45-54 2.52 0.01161 *
## applicant_age55-64 1.76 0.07914 .
## applicant_age65-74 -0.18 0.85611
## Black NA NA
## AIAN NA NA
## NHPI NA NA
## Asian NA NA
## Joint NA NA
## N_A NA NA
## two_or NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.788 on 23875 degrees of freedom
## (4793 observations deleted due to missingness)
## Multiple R-squared: 0.12, Adjusted R-squared: 0.119
## F-statistic: 125 on 26 and 23875 DF, p-value: <2e-16
loans_for_modelling <- loans[1:12]
full_model_2 <- lm(action_taken~., data = loans_for_modelling)
summary(full_model_2)
##
## Call:
## lm(formula = action_taken ~ ., data = loans_for_modelling)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6631 -0.4953 -0.2729 0.0613 2.1640
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 9.77e-01 1.05e-01
## derived_ethnicityHispanic or Latino 1.04e-01 2.49e-02
## derived_ethnicityJoint 1.08e-01 3.27e-02
## derived_ethnicityNot Hispanic or Latino 1.28e-02 2.46e-02
## derived_raceAmerican Indian or Alaska Native -2.46e-01 9.26e-02
## derived_raceAsian -3.36e-01 7.74e-02
## derived_raceBlack or African American -1.33e-01 7.84e-02
## derived_raceJoint -3.56e-01 8.00e-02
## derived_raceNative Hawaiian or Other Pacific Islander -2.03e-01 9.67e-02
## derived_raceRace Not Available -4.13e-01 7.90e-02
## derived_raceWhite -5.55e-01 7.63e-02
## derived_sexJoint 6.26e-03 1.98e-02
## derived_sexMale 1.09e-01 2.29e-02
## derived_sexSex Not Available 1.30e-01 3.54e-02
## loan_amount -8.11e-07 3.25e-08
## interest_rate 2.39e-01 7.67e-03
## property_value 2.05e-07 3.10e-08
## income -8.06e-04 1.14e-04
## applicant_ethnicity.1 3.22e-03 3.47e-03
## applicant_race.1 4.28e-03 1.49e-03
## applicant_sex 1.29e-02 1.76e-02
## applicant_age>74 -1.42e-01 5.04e-02
## applicant_age25-34 3.76e-02 4.67e-02
## applicant_age35-44 1.05e-01 4.65e-02
## applicant_age45-54 1.18e-01 4.68e-02
## applicant_age55-64 8.29e-02 4.72e-02
## applicant_age65-74 -8.71e-03 4.80e-02
## t value Pr(>|t|)
## (Intercept) 9.32 < 2e-16 ***
## derived_ethnicityHispanic or Latino 4.18 3.0e-05 ***
## derived_ethnicityJoint 3.29 0.00101 **
## derived_ethnicityNot Hispanic or Latino 0.52 0.60210
## derived_raceAmerican Indian or Alaska Native -2.66 0.00784 **
## derived_raceAsian -4.33 1.5e-05 ***
## derived_raceBlack or African American -1.70 0.09002 .
## derived_raceJoint -4.45 8.7e-06 ***
## derived_raceNative Hawaiian or Other Pacific Islander -2.10 0.03599 *
## derived_raceRace Not Available -5.23 1.7e-07 ***
## derived_raceWhite -7.27 3.7e-13 ***
## derived_sexJoint 0.32 0.75170
## derived_sexMale 4.74 2.1e-06 ***
## derived_sexSex Not Available 3.68 0.00023 ***
## loan_amount -24.97 < 2e-16 ***
## interest_rate 31.10 < 2e-16 ***
## property_value 6.60 4.2e-11 ***
## income -7.06 1.7e-12 ***
## applicant_ethnicity.1 0.93 0.35368
## applicant_race.1 2.87 0.00415 **
## applicant_sex 0.73 0.46523
## applicant_age>74 -2.82 0.00478 **
## applicant_age25-34 0.81 0.42031
## applicant_age35-44 2.25 0.02437 *
## applicant_age45-54 2.52 0.01161 *
## applicant_age55-64 1.76 0.07914 .
## applicant_age65-74 -0.18 0.85611
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.788 on 23875 degrees of freedom
## (4793 observations deleted due to missingness)
## Multiple R-squared: 0.12, Adjusted R-squared: 0.119
## F-statistic: 125 on 26 and 23875 DF, p-value: <2e-16
#{r} #pcr.fit=pcr(action_taken~.,data=loans_for_modelling,scale=FALSE,validation ="CV") #ezids::xkabledplyhead(loans_for_modelling) #summary(pcr.fit) #
str(loans_for_modelling)
## 'data.frame': 28695 obs. of 12 variables:
## $ derived_ethnicity : Factor w/ 4 levels "Ethnicity Not Available",..: 4 2 2 2 2 2 1 2 2 2 ...
## $ derived_race : Factor w/ 8 levels "2 or more minority races",..: 8 7 8 8 8 8 7 8 8 8 ...
## $ derived_sex : Factor w/ 4 levels "Female","Joint",..: 3 2 3 2 1 3 1 3 3 3 ...
## $ action_taken : int 1 1 1 3 1 1 1 1 3 1 ...
## $ loan_amount : num NA 175000 315000 115000 165000 265000 445000 335000 315000 285000 ...
## $ interest_rate : num 5.25 NA NA 4.46 NA ...
## $ property_value : num NA 235000 415000 145000 435000 415000 705000 425000 865000 405000 ...
## $ income : num NA 72 146 42 64 48 101 125 71 78 ...
## $ applicant_ethnicity.1: int 2 1 1 1 1 1 3 1 1 1 ...
## $ applicant_race.1 : int 5 6 5 5 5 5 6 5 5 5 ...
## $ applicant_sex : int 1 1 1 1 2 1 2 1 1 1 ...
## $ applicant_age : Factor w/ 8 levels "<25",">74","25-34",..: 4 4 4 3 4 4 4 5 5 4 ...
#{r} #loans_for_modelling = loans_for_modelling #loans_for_modelling$action_taken = #factor(loans_for_modelling$action_taken) #
action_taken_fit1 <- lm(action_taken ~ derived_race, data = loans_for_modelling, family = "binomial")
summary(action_taken_fit1)
##
## Call:
## lm(formula = action_taken ~ derived_race, data = loans_for_modelling,
## family = "binomial")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.819 -0.469 -0.343 -0.343 1.657
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 1.8189 0.0712
## derived_raceAmerican Indian or Alaska Native -0.1367 0.0870
## derived_raceAsian -0.3475 0.0725
## derived_raceBlack or African American -0.1228 0.0735
## derived_raceJoint -0.3934 0.0745
## derived_raceNative Hawaiian or Other Pacific Islander -0.1193 0.0879
## derived_raceRace Not Available -0.3495 0.0723
## derived_raceWhite -0.4764 0.0715
## t value Pr(>|t|)
## (Intercept) 25.54 < 2e-16 ***
## derived_raceAmerican Indian or Alaska Native -1.57 0.116
## derived_raceAsian -4.79 1.6e-06 ***
## derived_raceBlack or African American -1.67 0.095 .
## derived_raceJoint -5.28 1.3e-07 ***
## derived_raceNative Hawaiian or Other Pacific Islander -1.36 0.175
## derived_raceRace Not Available -4.84 1.3e-06 ***
## derived_raceWhite -6.66 2.7e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.803 on 28687 degrees of freedom
## Multiple R-squared: 0.0172, Adjusted R-squared: 0.0169
## F-statistic: 71.6 on 7 and 28687 DF, p-value: <2e-16
xkablevif(action_taken_fit1)
| derived_raceAmerican Indian or Alaska Native | derived_raceAsian | derived_raceBlack or African American | derived_raceJoint | derived_raceNative Hawaiian or Other Pacific Islander | derived_raceRace Not Available | derived_raceWhite |
|---|---|---|---|---|---|---|
| 3 | 25.1 | 14.9 | 11.1 | 2.89 | 29.6 | 54.9 |
action_taken_fit2 <- lm(action_taken ~ derived_race + derived_ethnicity, data = loans_for_modelling, family = "binomial")
summary(action_taken_fit2)
##
## Call:
## lm(formula = action_taken ~ derived_race + derived_ethnicity,
## data = loans_for_modelling, family = "binomial")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.932 -0.452 -0.310 -0.310 1.690
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 1.8324 0.0742
## derived_raceAmerican Indian or Alaska Native -0.1938 0.0871
## derived_raceAsian -0.3391 0.0724
## derived_raceBlack or African American -0.1151 0.0734
## derived_raceJoint -0.3905 0.0745
## derived_raceNative Hawaiian or Other Pacific Islander -0.1338 0.0878
## derived_raceRace Not Available -0.3800 0.0739
## derived_raceWhite -0.4956 0.0714
## derived_ethnicityHispanic or Latino 0.0998 0.0218
## derived_ethnicityJoint -0.0151 0.0292
## derived_ethnicityNot Hispanic or Latino -0.0265 0.0217
## t value Pr(>|t|)
## (Intercept) 24.68 < 2e-16 ***
## derived_raceAmerican Indian or Alaska Native -2.23 0.026 *
## derived_raceAsian -4.68 2.8e-06 ***
## derived_raceBlack or African American -1.57 0.117
## derived_raceJoint -5.24 1.6e-07 ***
## derived_raceNative Hawaiian or Other Pacific Islander -1.52 0.127
## derived_raceRace Not Available -5.14 2.7e-07 ***
## derived_raceWhite -6.94 4.0e-12 ***
## derived_ethnicityHispanic or Latino 4.58 4.8e-06 ***
## derived_ethnicityJoint -0.52 0.606
## derived_ethnicityNot Hispanic or Latino -1.22 0.223
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.801 on 28684 degrees of freedom
## Multiple R-squared: 0.0205, Adjusted R-squared: 0.0201
## F-statistic: 59.9 on 10 and 28684 DF, p-value: <2e-16
action_taken_fit3 <- lm(action_taken ~ derived_race + derived_ethnicity + derived_sex, data = loans_for_modelling, family = "binomial")
summary(action_taken_fit3)
##
## Call:
## lm(formula = action_taken ~ derived_race + derived_ethnicity +
## derived_sex, data = loans_for_modelling, family = "binomial")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.967 -0.460 -0.348 -0.243 1.757
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 1.81615 0.07483
## derived_raceAmerican Indian or Alaska Native -0.19405 0.08684
## derived_raceAsian -0.31991 0.07220
## derived_raceBlack or African American -0.10896 0.07324
## derived_raceJoint -0.31267 0.07460
## derived_raceNative Hawaiian or Other Pacific Islander -0.11520 0.08752
## derived_raceRace Not Available -0.36723 0.07377
## derived_raceWhite -0.46918 0.07124
## derived_ethnicityHispanic or Latino 0.11279 0.02292
## derived_ethnicityJoint 0.05885 0.03017
## derived_ethnicityNot Hispanic or Latino -0.00361 0.02269
## derived_sexJoint -0.10063 0.01303
## derived_sexMale 0.03765 0.01338
## derived_sexSex Not Available 0.06637 0.02840
## t value Pr(>|t|)
## (Intercept) 24.27 < 2e-16 ***
## derived_raceAmerican Indian or Alaska Native -2.23 0.0255 *
## derived_raceAsian -4.43 9.4e-06 ***
## derived_raceBlack or African American -1.49 0.1368
## derived_raceJoint -4.19 2.8e-05 ***
## derived_raceNative Hawaiian or Other Pacific Islander -1.32 0.1881
## derived_raceRace Not Available -4.98 6.5e-07 ***
## derived_raceWhite -6.59 4.6e-11 ***
## derived_ethnicityHispanic or Latino 4.92 8.7e-07 ***
## derived_ethnicityJoint 1.95 0.0511 .
## derived_ethnicityNot Hispanic or Latino -0.16 0.8736
## derived_sexJoint -7.72 1.2e-14 ***
## derived_sexMale 2.81 0.0049 **
## derived_sexSex Not Available 2.34 0.0194 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.799 on 28681 degrees of freedom
## Multiple R-squared: 0.0261, Adjusted R-squared: 0.0256
## F-statistic: 59.1 on 13 and 28681 DF, p-value: <2e-16
xkablevif(action_taken_fit3)
| derived_ethnicityHispanic or Latino | derived_ethnicityJoint | derived_ethnicityNot Hispanic or Latino | derived_raceAmerican Indian or Alaska Native | derived_raceAsian | derived_raceBlack or African American | derived_raceJoint | derived_raceNative Hawaiian or Other Pacific Islander | derived_raceRace Not Available | derived_raceWhite | derived_sexJoint | derived_sexMale | derived_sexSex Not Available |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3.02 | 25.1 | 14.9 | 11.2 | 2.89 | 31.1 | 55 | 3.63 | 2.04 | 5.4 | 1.87 | 1.73 | 1.64 |
action_taken_fit4 <- lm(action_taken ~ derived_race + derived_ethnicity + derived_sex + applicant_age, data = loans_for_modelling, family = "binomial")
summary(action_taken_fit4)
##
## Call:
## lm(formula = action_taken ~ derived_race + derived_ethnicity +
## derived_sex + applicant_age, data = loans_for_modelling,
## family = "binomial")
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.038 -0.443 -0.351 -0.217 1.838
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 1.76283 0.08712
## derived_raceAmerican Indian or Alaska Native -0.20408 0.08674
## derived_raceAsian -0.32347 0.07211
## derived_raceBlack or African American -0.12462 0.07316
## derived_raceJoint -0.31976 0.07451
## derived_raceNative Hawaiian or Other Pacific Islander -0.12763 0.08742
## derived_raceRace Not Available -0.37216 0.07369
## derived_raceWhite -0.48170 0.07116
## derived_ethnicityHispanic or Latino 0.12496 0.02296
## derived_ethnicityJoint 0.07118 0.03018
## derived_ethnicityNot Hispanic or Latino -0.00347 0.02266
## derived_sexJoint -0.09519 0.01309
## derived_sexMale 0.04860 0.01353
## derived_sexSex Not Available 0.07155 0.02845
## applicant_age>74 0.04600 0.04885
## applicant_age25-34 -0.02077 0.04675
## applicant_age35-44 0.03461 0.04629
## applicant_age45-54 0.07294 0.04641
## applicant_age55-64 0.10195 0.04680
## applicant_age65-74 0.10573 0.04732
## applicant_age8888 1.53778 0.79949
## t value Pr(>|t|)
## (Intercept) 20.24 < 2e-16 ***
## derived_raceAmerican Indian or Alaska Native -2.35 0.01864 *
## derived_raceAsian -4.49 7.3e-06 ***
## derived_raceBlack or African American -1.70 0.08854 .
## derived_raceJoint -4.29 1.8e-05 ***
## derived_raceNative Hawaiian or Other Pacific Islander -1.46 0.14431
## derived_raceRace Not Available -5.05 4.4e-07 ***
## derived_raceWhite -6.77 1.3e-11 ***
## derived_ethnicityHispanic or Latino 5.44 5.3e-08 ***
## derived_ethnicityJoint 2.36 0.01835 *
## derived_ethnicityNot Hispanic or Latino -0.15 0.87842
## derived_sexJoint -7.27 3.7e-13 ***
## derived_sexMale 3.59 0.00033 ***
## derived_sexSex Not Available 2.52 0.01190 *
## applicant_age>74 0.94 0.34641
## applicant_age25-34 -0.44 0.65686
## applicant_age35-44 0.75 0.45472
## applicant_age45-54 1.57 0.11604
## applicant_age55-64 2.18 0.02939 *
## applicant_age65-74 2.23 0.02548 *
## applicant_age8888 1.92 0.05443 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.798 on 28674 degrees of freedom
## Multiple R-squared: 0.0289, Adjusted R-squared: 0.0282
## F-statistic: 42.7 on 20 and 28674 DF, p-value: <2e-16
xkablevif(action_taken_fit4)
| applicant_age>74 | applicant_age25-34 | applicant_age35-44 | applicant_age45-54 | applicant_age55-64 | applicant_age65-74 | applicant_age8888 | derived_ethnicityHispanic or Latino | derived_ethnicityJoint | derived_ethnicityNot Hispanic or Latino | derived_raceAmerican Indian or Alaska Native | derived_raceAsian | derived_raceBlack or African American | derived_raceJoint | derived_raceNative Hawaiian or Other Pacific Islander | derived_raceRace Not Available | derived_raceWhite | derived_sexJoint | derived_sexMale | derived_sexSex Not Available |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3.02 | 25.1 | 15 | 11.2 | 2.89 | 31.1 | 55.1 | 3.65 | 2.05 | 5.41 | 1.9 | 1.77 | 1.65 | 6.96 | 13.5 | 17.9 | 16.6 | 13.7 | 11 | 1 |
The multiple R^2s increase with each successive model, indicating slight improvements from action_taken_fit1 through action_taken_fit4.
anova(action_taken_fit1,action_taken_fit2,action_taken_fit3,action_taken_fit4) -> anovaRes
str(anovaRes)
## Classes 'anova' and 'data.frame': 4 obs. of 6 variables:
## $ Res.Df : num 28687 28684 28681 28674
## $ RSS : num 18477 18415 18309 18256
## $ Df : num NA 3 3 7
## $ Sum of Sq: num NA 61.9 105.7 53.5
## $ F : num NA 32.4 55.3 12
## $ Pr(>F) : num NA 6.41e-21 1.15e-35 2.23e-15
## - attr(*, "heading")= chr [1:2] "Analysis of Variance Table\n" "Model 1: action_taken ~ derived_race\nModel 2: action_taken ~ derived_race + derived_ethnicity\nModel 3: action"| __truncated__
xkabledply(anovaRes, title = "ANOVA comparison between the models")
| Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
|---|---|---|---|---|---|
| 28687 | 18477 | NA | NA | NA | NA |
| 28684 | 18415 | 3 | 61.9 | 32.4 | 0 |
| 28681 | 18309 | 3 | 105.7 | 55.3 | 0 |
| 28674 | 18256 | 7 | 53.5 | 12.0 | 0 |
#SideNotes on ANOVA Testing These are some constructs of my ANOVA test I left them here just incase I need to revist them, these might not be perfect because I ran multiple different variants, I assume it will not be used ultimately and if we need too we can run chi-squared on the balanced dataset.
Conducting ANOVA Test on action_taken variable and income, saved test as anova
anova= aov(income ~ approval, data=hmda) str(anova)
#Plotting ANOVA TEST
loadPkg(“ggplot2”) ggplot(hmda, aes(x=approval, y=income)) + geom_boxplot( colour=c(“#ff0000” ,“#11cc11”)) + labs(title=“Income difference between Approved and Denied Apps”, x=“Approval Status”, y = “Income”)
#plot(income ~ action_taken, data=hmda) anova= aov(action_taken ~ Black, data=hmda) # anovaRes # this does not give the easy-to-read result of the aov analysis names(anova) # summary(anovaRes) xkabledply(anova) # same exact result with or without re-ordering.